lorenzo-final/lib/python3.12/site-packages/markdown/htmlparser.py

# Python Markdown

# A Python implementation of John Gruber's Markdown.

# Documentation: https://python-markdown.github.io/
# GitHub: https://github.com/Python-Markdown/markdown/
# PyPI: https://pypi.org/project/Markdown/

# Started by Manfred Stienstra (http://www.dwerg.net/).
# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).
# Currently maintained by Waylan Limberg (https://github.com/waylan),
# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).

# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)
# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)
# Copyright 2004 Manfred Stienstra (the original version)

# License: BSD (see LICENSE.md for details).

"""
This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
A copy is imported rather than the module being directly imported as this ensures that the user can import
and use the unmodified library for their own needs.
"""

from __future__ import annotations

import re
import importlib.util
import sys
from typing import TYPE_CHECKING, Sequence

if TYPE_CHECKING:  # pragma: no cover
    from markdown import Markdown


# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
# Users can still do `from html import parser` and get the default behavior.
# The copy is created by locating the module spec, building a fresh module
# object from it and executing the module code into that object, instead of
# using a regular `import` statement.
spec = importlib.util.find_spec('html.parser')
htmlparser = importlib.util.module_from_spec(spec)
spec.loader.exec_module(htmlparser)
# Register the copy in `sys.modules` under the top-level name `htmlparser`.
sys.modules['htmlparser'] = htmlparser

# All assignments below replace module-level regular expressions on the private
# `htmlparser` copy only.

# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
htmlparser.piclose = re.compile(r'\?>')
# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')
# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.
htmlparser.incomplete = htmlparser.entityref
# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
# The pattern is compiled with `re.VERBOSE`, so the `#` notes inside the string are regex comments;
# the ones reading "added backtick here" mark the character classes that exclude the backtick.
htmlparser.locatestarttagend_tolerant = re.compile(r"""
  <[a-zA-Z][^`\t\n\r\f />\x00]*       # tag name <= added backtick here
  (?:[\s/]*                           # optional whitespace before attribute name
    (?:(?<=['"\s/])[^`\s/>][^\s/=>]*  # attribute name <= added backtick here
      (?:\s*=+\s*                     # value indicator
        (?:'[^']*'                    # LITA-enclosed value
          |"[^"]*"                    # LIT-enclosed value
          |(?!['"])[^`>\s]*           # bare value <= added backtick here
         )
         (?:\s*,)*                    # possibly followed by a comma
       )?(?:\s|/(?!>))*
     )*
   )?
  \s*                                 # trailing whitespace
""", re.VERBOSE)

# Match a blank line at the start of a block of text (two newlines).
# Each of the two newlines may be preceded by any number of space characters
# (spaces only; the pattern does not accept tabs or other whitespace).
blank_line_re = re.compile(r'^([ ]*\n){2}')


class HTMLExtractor(htmlparser.HTMLParser):
    """
    Extract raw HTML from text.

    The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
    [`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
    is stored in `cleandoc` as a list of strings.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        """
        Create the extractor.

        Arguments:
            md: The `Markdown` instance whose `htmlStash` and `is_block_level` are used.
            *args: Passed through to the base `HTMLParser`.
            **kwargs: Passed through to the base `HTMLParser`. `convert_charrefs`
                defaults to `False` unless explicitly provided.
        """
        kwargs.setdefault('convert_charrefs', False)

        # Block tags that should contain no content (self closing)
        self.empty_tags = {'hr'}

        self.lineno_start_cache = [0]

        # This calls self.reset
        super().__init__(*args, **kwargs)
        self.md = md

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.inraw = False
        self.intail = False
        self.stack: list[str] = []  # When `inraw==True`, stack contains a list of tags
        self._cache: list[str] = []
        self.cleandoc: list[str] = []
        # Index `n` holds the char index in `rawdata` at which line `n + 1` starts.
        self.lineno_start_cache = [0]

        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        if self.rawdata:
            # Temp fix for https://bugs.python.org/issue41989
            # TODO: remove this when the bug is fixed in all supported Python versions.
            if self.convert_charrefs and not self.cdata_elem:  # pragma: no cover
                self.handle_data(htmlparser.unescape(self.rawdata))
            else:
                self.handle_data(self.rawdata)
        # Handle any unclosed tags.
        if self._cache:
            self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
            self._cache = []

    @property
    def line_offset(self) -> int:
        """Returns char index in `self.rawdata` for the start of the current line. """
        # Lazily extend the cache of line start positions up to the current line.
        for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):
            last_line_start_pos = self.lineno_start_cache[ii]
            lf_pos = self.rawdata.find('\n', last_line_start_pos)
            if lf_pos == -1:
                # No more newlines found. Use end of raw data as start of line beyond end.
                lf_pos = len(self.rawdata)
            self.lineno_start_cache.append(lf_pos+1)

        return self.lineno_start_cache[self.lineno-1]

    def at_line_start(self) -> bool:
        """
        Returns True if current position is at start of line.

        Allows for up to three characters of leading whitespace at start of line.
        """
        if self.offset == 0:
            return True
        if self.offset > 3:
            return False
        # Confirm up to first 3 chars are whitespace
        return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''

    def get_endtag_text(self, tag: str) -> str:
        """
        Returns the text of the end tag.

        If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
        """
        # Attempt to extract actual tag from raw source text
        start = self.line_offset + self.offset
        m = htmlparser.endendtag.search(self.rawdata, start)
        if m:
            return self.rawdata[start:m.end()]
        else:  # pragma: no cover
            # Failed to extract from raw data. Assume well formed and lowercase.
            return f'</{tag}>'

    def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str | None]]):
        """
        Handle a start tag.

        Tags listed in `empty_tags` are routed to `handle_startendtag`. A block-level
        tag (according to `md.is_block_level`) opens a new raw block when it follows
        the tail of a previous block, or when it is at the start of a line and no raw
        block is currently open. While in a raw block the tag text is cached;
        otherwise it is appended to `cleandoc` unchanged.
        """
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags:
            self.handle_startendtag(tag, attrs)
            return

        if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):
            # Started a new raw block. Prepare stack.
            self.inraw = True
            self.cleandoc.append('\n')

        text = self.get_starttag_text()
        if self.inraw:
            self.stack.append(tag)
            self._cache.append(text)
        else:
            self.cleandoc.append(text)
            if tag in self.CDATA_CONTENT_ELEMENTS:
                # This is presumably a standalone tag in a code span (see #1036).
                self.clear_cdata_mode()

    def handle_endtag(self, tag: str):
        """
        Handle an end tag.

        Inside a raw block the end tag text is cached and the stack is unwound down
        to the most recent matching start tag. When the stack becomes empty the
        cached text is passed to `md.htmlStash.store` and the returned value is
        appended to `cleandoc`. Outside a raw block the text is appended to
        `cleandoc` unchanged.
        """
        text = self.get_endtag_text(tag)

        if self.inraw:
            self._cache.append(text)
            if tag in self.stack:
                # Remove tag from stack
                while self.stack:
                    if self.stack.pop() == tag:
                        break
            if not self.stack:
                # End of raw block.
                if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):
                    # Preserve blank line and end of raw block.
                    self._cache.append('\n')
                else:
                    # More content exists after `endtag`.
                    self.intail = True
                # Reset stack.
                self.inraw = False
                self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))
                # Insert blank line between this and next line.
                self.cleandoc.append('\n\n')
                self._cache = []
        else:
            self.cleandoc.append(text)

    def handle_data(self, data: str):
        """
        Handle plain text.

        Data containing a newline ends the "tail" state. The text is cached while in
        a raw block and appended to `cleandoc` otherwise.
        """
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw:
            self._cache.append(data)
        else:
            self.cleandoc.append(data)

    def handle_empty_tag(self, data: str, is_block: bool):
        """
        Handle empty tags (`<data>`).

        Arguments:
            data: The full source text of the item being handled.
            is_block: Whether the item may form a standalone raw block when it is
                found at the start of a line.
        """
        if self.inraw or self.intail:
            # Append this to the existing raw block
            self._cache.append(data)
        elif self.at_line_start() and is_block:
            # Handle this as a standalone raw block
            if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):
                # Preserve blank line after tag in raw block.
                data += '\n'
            else:
                # More content exists after tag.
                self.intail = True
            item = self.cleandoc[-1] if self.cleandoc else ''
            # If we only have one newline before block element, add another
            if not item.endswith('\n\n') and item.endswith('\n'):
                self.cleandoc.append('\n')
            self.cleandoc.append(self.md.htmlStash.store(data))
            # Insert blank line between this and next line.
            self.cleandoc.append('\n\n')
        else:
            self.cleandoc.append(data)

    def handle_startendtag(self, tag: str, attrs):
        """Handle a self-closing tag; it is a block if `md.is_block_level(tag)` says so."""
        self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))

    def handle_charref(self, name: str):
        """Pass a numeric character reference through as `&#name;` (never a block)."""
        self.handle_empty_tag(f'&#{name};', is_block=False)

    def handle_entityref(self, name: str):
        """Pass a named entity reference through as `&name;` (never a block)."""
        self.handle_empty_tag(f'&{name};', is_block=False)

    def handle_comment(self, data: str):
        """Rebuild the comment as `<!--data-->` and handle it as a potential block."""
        self.handle_empty_tag(f'<!--{data}-->', is_block=True)

    def handle_decl(self, data: str):
        """Rebuild the declaration as `<!data>` and handle it as a potential block."""
        self.handle_empty_tag(f'<!{data}>', is_block=True)

    def handle_pi(self, data: str):
        """Rebuild the processing instruction as `<?data?>` and handle it as a potential block."""
        self.handle_empty_tag(f'<?{data}?>', is_block=True)

    def unknown_decl(self, data: str):
        """Rebuild a `<![...` declaration, closing with `]]>` for `CDATA[` and `]>` otherwise."""
        end = ']]>' if data.startswith('CDATA[') else ']>'
        self.handle_empty_tag(f'<![{data}{end}', is_block=True)

    def parse_pi(self, i: int) -> int:
        """Parse a processing instruction only at line start or in a tail; else emit `<?` as data."""
        if self.at_line_start() or self.intail:
            return super().parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        """Parse a declaration only at line start or in a tail; else emit `<!` as data."""
        if self.at_line_start() or self.intail:
            return super().parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2

    def parse_bogus_comment(self, i: int, report: int = 0) -> int:
        """Parse a bogus comment and pass its source text through unaltered."""
        # Override the default behavior so that bogus comments get passed
        # through unaltered by setting `report` to `0` (see #1425).
        pos = super().parse_bogus_comment(i, report)
        if pos == -1:  # pragma: no cover
            return -1
        self.handle_empty_tag(self.rawdata[i:pos], is_block=False)
        return pos

    # The rest has been copied from base class in standard lib to address #1036.
    # As `__starttag_text` is private, all references to it must be in this subclass.
    # The last few lines of `parse_starttag` are reversed so that `handle_starttag`
    # can override `cdata_mode` in certain situations (in a code span).
    __starttag_text: str | None = None

    def get_starttag_text(self) -> str | None:
        """Return full source of start tag: `<...>` (`None` if no start tag has been parsed)."""
        return self.__starttag_text

    def parse_starttag(self, i: int) -> int:  # pragma: no cover
        """
        Parse the start tag beginning at index `i` of `self.rawdata`.

        Returns the index just past the tag, or the negative value returned by
        `check_for_whole_start_tag` when no complete start tag is available.
        """
        self.__starttag_text = None
        endpos = self.check_for_whole_start_tag(i)
        if endpos < 0:
            return endpos
        rawdata = self.rawdata
        self.__starttag_text = rawdata[i:endpos]

        # Now parse the data between `i+1` and `endpos` into a tag and `attrs`
        attrs = []
        match = htmlparser.tagfind_tolerant.match(rawdata, i+1)
        assert match, 'unexpected call to parse_starttag()'
        k = match.end()
        self.lasttag = tag = match.group(1).lower()
        while k < endpos:
            m = htmlparser.attrfind_tolerant.match(rawdata, k)
            if not m:
                break
            attrname, rest, attrvalue = m.group(1, 2, 3)
            if not rest:
                # Attribute without a value (e.g. `<input disabled>`).
                attrvalue = None
            elif attrvalue[:1] == '\'' == attrvalue[-1:] or \
                 attrvalue[:1] == '"' == attrvalue[-1:]:  # noqa: E127
                # Strip the matching pair of surrounding quotes.
                attrvalue = attrvalue[1:-1]
            if attrvalue:
                attrvalue = htmlparser.unescape(attrvalue)
            attrs.append((attrname.lower(), attrvalue))
            k = m.end()

        end = rawdata[k:endpos].strip()
        if end not in (">", "/>"):
            # The tag does not end cleanly; pass its source through as plain data.
            self.handle_data(rawdata[i:endpos])
            return endpos
        if end.endswith('/>'):
            # XHTML-style empty tag: `<span attr="value" />`
            self.handle_startendtag(tag, attrs)
        else:
            # *** set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) ***
            if tag in self.CDATA_CONTENT_ELEMENTS:
                self.set_cdata_mode(tag)
            self.handle_starttag(tag, attrs)
        return endpos
first commit 4 months ago			`# Python Markdown`

			`# A Python implementation of John Gruber's Markdown.`

			`# Documentation: https://python-markdown.github.io/`
			`# GitHub: https://github.com/Python-Markdown/markdown/`
			`# PyPI: https://pypi.org/project/Markdown/`

			`# Started by Manfred Stienstra (http://www.dwerg.net/).`
			`# Maintained for a few years by Yuri Takhteyev (http://www.freewisdom.org).`
			`# Currently maintained by Waylan Limberg (https://github.com/waylan),`
			`# Dmitry Shachnev (https://github.com/mitya57) and Isaac Muse (https://github.com/facelessuser).`

			`# Copyright 2007-2023 The Python Markdown Project (v. 1.7 and later)`
			`# Copyright 2004, 2005, 2006 Yuri Takhteyev (v. 0.2-1.6b)`
			`# Copyright 2004 Manfred Stienstra (the original version)`

			`# License: BSD (see LICENSE.md for details).`

			`"""`
			This module imports a copy of [`html.parser.HTMLParser`][] and modifies it heavily through monkey-patches.
			`A copy is imported rather than the module being directly imported as this ensures that the user can import`
			`and use the unmodified library for their own needs.`
			`"""`

			`from __future__ import annotations`

			`import re`
			`import importlib.util`
			`import sys`
			`from typing import TYPE_CHECKING, Sequence`

			`if TYPE_CHECKING: # pragma: no cover`
			`from markdown import Markdown`


			# Import a copy of the html.parser lib as `htmlparser` so we can monkeypatch it.
			# Users can still do `from html import parser` and get the default behavior.
			`spec = importlib.util.find_spec('html.parser')`
			`htmlparser = importlib.util.module_from_spec(spec)`
			`spec.loader.exec_module(htmlparser)`
			`sys.modules['htmlparser'] = htmlparser`

			# Monkeypatch `HTMLParser` to only accept `?>` to close Processing Instructions.
			`htmlparser.piclose = re.compile(r'\?>')`
			# Monkeypatch `HTMLParser` to only recognize entity references with a closing semicolon.
			`htmlparser.entityref = re.compile(r'&([a-zA-Z][-.a-zA-Z0-9]*);')`
			# Monkeypatch `HTMLParser` to no longer support partial entities. We are always feeding a complete block,
			# so the 'incomplete' functionality is unnecessary. As the `entityref` regex is run right before incomplete,
			`# and the two regex are the same, then incomplete will simply never match and we avoid the logic within.`
			`htmlparser.incomplete = htmlparser.entityref`
			# Monkeypatch `HTMLParser` to not accept a backtick in a tag name, attribute name, or bare value.
			`htmlparser.locatestarttagend_tolerant = re.compile(r"""`
			<[a-zA-Z][^`\t\n\r\f />\x00]* # tag name <= added backtick here
			`(?:[\s/]* # optional whitespace before attribute name`
			(?:(?<=['"\s/])[^`\s/>][^\s/=>]* # attribute name <= added backtick here
			`(?:\s=+\s # value indicator`
			`(?:'[^']*' # LITA-enclosed value`
			`\|"[^"]*" # LIT-enclosed value`
			\|(?!['"])[^`>\s]* # bare value <= added backtick here
			`)`
			`(?:\s,) # possibly followed by a comma`
			`)?(?:\s\|/(?!>))*`
			`)*`
			`)?`
			`\s* # trailing whitespace`
			`""", re.VERBOSE)`

			`# Match a blank line at the start of a block of text (two newlines).`
			`# The newlines may be preceded by additional whitespace.`
			`blank_line_re = re.compile(r'^([ ]*\n){2}')`


			`class HTMLExtractor(htmlparser.HTMLParser):`
			`"""`
			`Extract raw HTML from text.`

			The raw HTML is stored in the [`htmlStash`][markdown.util.HtmlStash] of the
			[`Markdown`][markdown.Markdown] instance passed to `md` and the remaining text
			is stored in `cleandoc` as a list of strings.
			`"""`

			`def __init__(self, md: Markdown, args, *kwargs):`
			`if 'convert_charrefs' not in kwargs:`
			`kwargs['convert_charrefs'] = False`

			`# Block tags that should contain no content (self closing)`
			`self.empty_tags = set(['hr'])`

			`self.lineno_start_cache = [0]`

			`# This calls self.reset`
			`super().__init__(args, *kwargs)`
			`self.md = md`

			`def reset(self):`
			`"""Reset this instance. Loses all unprocessed data."""`
			`self.inraw = False`
			`self.intail = False`
			self.stack: list[str] = [] # When `inraw==True`, stack contains a list of tags
			`self._cache: list[str] = []`
			`self.cleandoc: list[str] = []`
			`self.lineno_start_cache = [0]`

			`super().reset()`

			`def close(self):`
			`"""Handle any buffered data."""`
			`super().close()`
			`if len(self.rawdata):`
			`# Temp fix for https://bugs.python.org/issue41989`
			`# TODO: remove this when the bug is fixed in all supported Python versions.`
			`if self.convert_charrefs and not self.cdata_elem: # pragma: no cover`
			`self.handle_data(htmlparser.unescape(self.rawdata))`
			`else:`
			`self.handle_data(self.rawdata)`
			`# Handle any unclosed tags.`
			`if len(self._cache):`
			`self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))`
			`self._cache = []`

			`@property`
			`def line_offset(self) -> int:`
			"""Returns char index in `self.rawdata` for the start of the current line. """
			`for ii in range(len(self.lineno_start_cache)-1, self.lineno-1):`
			`last_line_start_pos = self.lineno_start_cache[ii]`
			`lf_pos = self.rawdata.find('\n', last_line_start_pos)`
			`if lf_pos == -1:`
			`# No more newlines found. Use end of raw data as start of line beyond end.`
			`lf_pos = len(self.rawdata)`
			`self.lineno_start_cache.append(lf_pos+1)`

			`return self.lineno_start_cache[self.lineno-1]`

			`def at_line_start(self) -> bool:`
			`"""`
			`Returns True if current position is at start of line.`

			`Allows for up to three blank spaces at start of line.`
			`"""`
			`if self.offset == 0:`
			`return True`
			`if self.offset > 3:`
			`return False`
			`# Confirm up to first 3 chars are whitespace`
			`return self.rawdata[self.line_offset:self.line_offset + self.offset].strip() == ''`

			`def get_endtag_text(self, tag: str) -> str:`
			`"""`
			`Returns the text of the end tag.`

			If it fails to extract the actual text from the raw data, it builds a closing tag with `tag`.
			`"""`
			`# Attempt to extract actual tag from raw source text`
			`start = self.line_offset + self.offset`
			`m = htmlparser.endendtag.search(self.rawdata, start)`
			`if m:`
			`return self.rawdata[start:m.end()]`
			`else: # pragma: no cover`
			`# Failed to extract from raw data. Assume well formed and lowercase.`
			`return '</{}>'.format(tag)`

			`def handle_starttag(self, tag: str, attrs: Sequence[tuple[str, str]]):`
			`# Handle tags that should always be empty and do not specify a closing tag`
			`if tag in self.empty_tags:`
			`self.handle_startendtag(tag, attrs)`
			`return`

			`if self.md.is_block_level(tag) and (self.intail or (self.at_line_start() and not self.inraw)):`
			`# Started a new raw block. Prepare stack.`
			`self.inraw = True`
			`self.cleandoc.append('\n')`

			`text = self.get_starttag_text()`
			`if self.inraw:`
			`self.stack.append(tag)`
			`self._cache.append(text)`
			`else:`
			`self.cleandoc.append(text)`
			`if tag in self.CDATA_CONTENT_ELEMENTS:`
			`# This is presumably a standalone tag in a code span (see #1036).`
			`self.clear_cdata_mode()`

			`def handle_endtag(self, tag: str):`
			`text = self.get_endtag_text(tag)`

			`if self.inraw:`
			`self._cache.append(text)`
			`if tag in self.stack:`
			`# Remove tag from stack`
			`while self.stack:`
			`if self.stack.pop() == tag:`
			`break`
			`if len(self.stack) == 0:`
			`# End of raw block.`
			`if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(text):]):`
			`# Preserve blank line and end of raw block.`
			`self._cache.append('\n')`
			`else:`
			# More content exists after `endtag`.
			`self.intail = True`
			`# Reset stack.`
			`self.inraw = False`
			`self.cleandoc.append(self.md.htmlStash.store(''.join(self._cache)))`
			`# Insert blank line between this and next line.`
			`self.cleandoc.append('\n\n')`
			`self._cache = []`
			`else:`
			`self.cleandoc.append(text)`

			`def handle_data(self, data: str):`
			`if self.intail and '\n' in data:`
			`self.intail = False`
			`if self.inraw:`
			`self._cache.append(data)`
			`else:`
			`self.cleandoc.append(data)`

			`def handle_empty_tag(self, data: str, is_block: bool):`
			""" Handle empty tags (`<data>`). """
			`if self.inraw or self.intail:`
			`# Append this to the existing raw block`
			`self._cache.append(data)`
			`elif self.at_line_start() and is_block:`
			`# Handle this as a standalone raw block`
			`if blank_line_re.match(self.rawdata[self.line_offset + self.offset + len(data):]):`
			`# Preserve blank line after tag in raw block.`
			`data += '\n'`
			`else:`
			`# More content exists after tag.`
			`self.intail = True`
			`item = self.cleandoc[-1] if self.cleandoc else ''`
			`# If we only have one newline before block element, add another`
			`if not item.endswith('\n\n') and item.endswith('\n'):`
			`self.cleandoc.append('\n')`
			`self.cleandoc.append(self.md.htmlStash.store(data))`
			`# Insert blank line between this and next line.`
			`self.cleandoc.append('\n\n')`
			`else:`
			`self.cleandoc.append(data)`

			`def handle_startendtag(self, tag: str, attrs):`
			`self.handle_empty_tag(self.get_starttag_text(), is_block=self.md.is_block_level(tag))`

			`def handle_charref(self, name: str):`
			`self.handle_empty_tag('&#{};'.format(name), is_block=False)`

			`def handle_entityref(self, name: str):`
			`self.handle_empty_tag('&{};'.format(name), is_block=False)`

			`def handle_comment(self, data: str):`
			`self.handle_empty_tag('<!--{}-->'.format(data), is_block=True)`

			`def handle_decl(self, data: str):`
			`self.handle_empty_tag('<!{}>'.format(data), is_block=True)`

			`def handle_pi(self, data: str):`
			`self.handle_empty_tag('<?{}?>'.format(data), is_block=True)`

			`def unknown_decl(self, data: str):`
			`end = ']]>' if data.startswith('CDATA[') else ']>'`
			`self.handle_empty_tag('<![{}{}'.format(data, end), is_block=True)`

			`def parse_pi(self, i: int) -> int:`
			`if self.at_line_start() or self.intail:`
			`return super().parse_pi(i)`
			`# This is not the beginning of a raw block so treat as plain data`
			`# and avoid consuming any tags which may follow (see #1066).`
			`self.handle_data('<?')`
			`return i + 2`

			`def parse_html_declaration(self, i: int) -> int:`
			`if self.at_line_start() or self.intail:`
			`return super().parse_html_declaration(i)`
			`# This is not the beginning of a raw block so treat as plain data`
			`# and avoid consuming any tags which may follow (see #1066).`
			`self.handle_data('<!')`
			`return i + 2`

			`def parse_bogus_comment(self, i: int, report: int = 0) -> int:`
			`# Override the default behavior so that bogus comments get passed`
			# through unaltered by setting `report` to `0` (see #1425).
			`pos = super().parse_bogus_comment(i, report)`
			`if pos == -1: # pragma: no cover`
			`return -1`
			`self.handle_empty_tag(self.rawdata[i:pos], is_block=False)`
			`return pos`

			`# The rest has been copied from base class in standard lib to address #1036.`
			# As `__startag_text` is private, all references to it must be in this subclass.
			# The last few lines of `parse_starttag` are reversed so that `handle_starttag`
			# can override `cdata_mode` in certain situations (in a code span).
			`__starttag_text: str \| None = None`

			`def get_starttag_text(self) -> str:`
			"""Return full source of start tag: `<...>`."""
			`return self.__starttag_text`

			`def parse_starttag(self, i: int) -> int: # pragma: no cover`
			`self.__starttag_text = None`
			`endpos = self.check_for_whole_start_tag(i)`
			`if endpos < 0:`
			`return endpos`
			`rawdata = self.rawdata`
			`self.__starttag_text = rawdata[i:endpos]`

			# Now parse the data between `i+1` and `j` into a tag and `attrs`
			`attrs = []`
			`match = htmlparser.tagfind_tolerant.match(rawdata, i+1)`
			`assert match, 'unexpected call to parse_starttag()'`
			`k = match.end()`
			`self.lasttag = tag = match.group(1).lower()`
			`while k < endpos:`
			`m = htmlparser.attrfind_tolerant.match(rawdata, k)`
			`if not m:`
			`break`
			`attrname, rest, attrvalue = m.group(1, 2, 3)`
			`if not rest:`
			`attrvalue = None`
			`elif attrvalue[:1] == '\'' == attrvalue[-1:] or \`
			`attrvalue[:1] == '"' == attrvalue[-1:]: # noqa: E127`
			`attrvalue = attrvalue[1:-1]`
			`if attrvalue:`
			`attrvalue = htmlparser.unescape(attrvalue)`
			`attrs.append((attrname.lower(), attrvalue))`
			`k = m.end()`

			`end = rawdata[k:endpos].strip()`
			`if end not in (">", "/>"):`
			`lineno, offset = self.getpos()`
			`if "\n" in self.__starttag_text:`
			`lineno = lineno + self.__starttag_text.count("\n")`
			`offset = len(self.__starttag_text) \`
			`- self.__starttag_text.rfind("\n") # noqa: E127`
			`else:`
			`offset = offset + len(self.__starttag_text)`
			`self.handle_data(rawdata[i:endpos])`
			`return endpos`
			`if end.endswith('/>'):`
			# XHTML-style empty tag: `<span attr="value" />`
			`self.handle_startendtag(tag, attrs)`
			`else:`
			# * set `cdata_mode` first so we can override it in `handle_starttag` (see #1036) *
			`if tag in self.CDATA_CONTENT_ELEMENTS:`
			`self.set_cdata_mode(tag)`
			`self.handle_starttag(tag, attrs)`
			`return endpos`