"""
self._parse(stream, True, *args, **kwargs)
return self.tree.getFragment()
def parseError(self, errorcode="XXX-undefined-error", datavars=None):
# XXX The idea is to make errorcode mandatory.
if datavars is None:
datavars = {}
self.errors.append((self.tokenizer.stream.position(), errorcode, datavars))
if self.strict:
raise ParseError(E[errorcode] % datavars)
def adjustMathMLAttributes(self, token):
adjust_attributes(token, adjustMathMLAttributes)
def adjustSVGAttributes(self, token):
adjust_attributes(token, adjustSVGAttributes)
def adjustForeignAttributes(self, token):
adjust_attributes(token, adjustForeignAttributesMap)
def reparseTokenNormal(self, token):
# pylint:disable=unused-argument
self.parser.phase()
def resetInsertionMode(self):
# The name of this method is mostly historical. (It's also used in the
# specification.)
last = False
newModes = {
"select": "inSelect",
"td": "inCell",
"th": "inCell",
"tr": "inRow",
"tbody": "inTableBody",
"thead": "inTableBody",
"tfoot": "inTableBody",
"caption": "inCaption",
"colgroup": "inColumnGroup",
"table": "inTable",
"head": "inBody",
"body": "inBody",
"frameset": "inFrameset",
"html": "beforeHead"
}
for node in self.tree.openElements[::-1]:
nodeName = node.name
new_phase = None
if node == self.tree.openElements[0]:
assert self.innerHTML
last = True
nodeName = self.innerHTML
# Check for conditions that should only happen in the innerHTML
# case
if nodeName in ("select", "colgroup", "head", "html"):
assert self.innerHTML
if not last and node.namespace != self.tree.defaultNamespace:
continue
if nodeName in newModes:
new_phase = self.phases[newModes[nodeName]]
break
elif last:
new_phase = self.phases["inBody"]
break
self.phase = new_phase
def parseRCDataRawtext(self, token, contentType):
# Generic RCDATA/RAWTEXT Parsing algorithm
assert contentType in ("RAWTEXT", "RCDATA")
self.tree.insertElement(token)
if contentType == "RAWTEXT":
self.tokenizer.state = self.tokenizer.rawtextState
else:
self.tokenizer.state = self.tokenizer.rcdataState
self.originalPhase = self.phase
self.phase = self.phases["text"]
@_utils.memoize
def getPhases(debug):
def log(function):
"""Logger that records which phase processes each token"""
type_names = {value: key for key, value in tokenTypes.items()}
def wrapped(self, *args, **kwargs):
if function.__name__.startswith("process") and len(args) > 0:
token = args[0]
info = {"type": type_names[token['type']]}
if token['type'] in tagTokenTypes:
info["name"] = token['name']
self.parser.log.append((self.parser.tokenizer.state.__name__,
self.parser.phase.__class__.__name__,
self.__class__.__name__,
function.__name__,
info))
return function(self, *args, **kwargs)
else:
return function(self, *args, **kwargs)
return wrapped
def getMetaclass(use_metaclass, metaclass_func):
if use_metaclass:
return method_decorator_metaclass(metaclass_func)
else:
return type
# pylint:disable=unused-argument
class Phase(with_metaclass(getMetaclass(debug, log))):
"""Base class for helper object that implements each phase of processing
"""
__slots__ = ("parser", "tree", "__startTagCache", "__endTagCache")
def __init__(self, parser, tree):
self.parser = parser
self.tree = tree
self.__startTagCache = {}
self.__endTagCache = {}
def processEOF(self):
raise NotImplementedError
def processComment(self, token):
# For most phases the following is correct. Where it's not it will be
# overridden.
self.tree.insertComment(token, self.tree.openElements[-1])
def processDoctype(self, token):
self.parser.parseError("unexpected-doctype")
def processCharacters(self, token):
self.tree.insertText(token["data"])
def processSpaceCharacters(self, token):
self.tree.insertText(token["data"])
def processStartTag(self, token):
# Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
# (CPython 2.7, 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
if name in self.__startTagCache:
func = self.__startTagCache[name]
else:
func = self.__startTagCache[name] = self.startTagHandler[name]
# bound the cache size in case we get loads of unknown tags
while len(self.__startTagCache) > len(self.startTagHandler) * 1.1:
# this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
self.__startTagCache.pop(next(iter(self.__startTagCache)))
return func(token)
def startTagHtml(self, token):
if not self.parser.firstStartTag and token["name"] == "html":
self.parser.parseError("non-html-root")
# XXX Need a check here to see if the first start tag token emitted is
# this token... If it's not, invoke self.parser.parseError().
for attr, value in token["data"].items():
if attr not in self.tree.openElements[0].attributes:
self.tree.openElements[0].attributes[attr] = value
self.parser.firstStartTag = False
def processEndTag(self, token):
# Note the caching is done here rather than BoundMethodDispatcher as doing it there
# requires a circular reference to the Phase, and this ends up with a significant
# (CPython 2.7, 3.8) GC cost when parsing many short inputs
name = token["name"]
# In Py2, using `in` is quicker in general than try/except KeyError
# In Py3, `in` is quicker when there are few cache hits (typically short inputs)
if name in self.__endTagCache:
func = self.__endTagCache[name]
else:
func = self.__endTagCache[name] = self.endTagHandler[name]
# bound the cache size in case we get loads of unknown tags
while len(self.__endTagCache) > len(self.endTagHandler) * 1.1:
# this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7
self.__endTagCache.pop(next(iter(self.__endTagCache)))
return func(token)
class InitialPhase(Phase):
__slots__ = tuple()
def processSpaceCharacters(self, token):
pass
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processDoctype(self, token):
name = token["name"]
publicId = token["publicId"]
systemId = token["systemId"]
correct = token["correct"]
if (name != "html" or publicId is not None or
systemId is not None and systemId != "about:legacy-compat"):
self.parser.parseError("unknown-doctype")
if publicId is None:
publicId = ""
self.tree.insertDoctype(token)
if publicId != "":
publicId = publicId.translate(asciiUpper2Lower)
if (not correct or token["name"] != "html" or
publicId.startswith(
("+//silmaril//dtd html pro v0r11 19970101//",
"-//advasoft ltd//dtd html 3.0 aswedit + extensions//",
"-//as//dtd html 3.0 aswedit + extensions//",
"-//ietf//dtd html 2.0 level 1//",
"-//ietf//dtd html 2.0 level 2//",
"-//ietf//dtd html 2.0 strict level 1//",
"-//ietf//dtd html 2.0 strict level 2//",
"-//ietf//dtd html 2.0 strict//",
"-//ietf//dtd html 2.0//",
"-//ietf//dtd html 2.1e//",
"-//ietf//dtd html 3.0//",
"-//ietf//dtd html 3.2 final//",
"-//ietf//dtd html 3.2//",
"-//ietf//dtd html 3//",
"-//ietf//dtd html level 0//",
"-//ietf//dtd html level 1//",
"-//ietf//dtd html level 2//",
"-//ietf//dtd html level 3//",
"-//ietf//dtd html strict level 0//",
"-//ietf//dtd html strict level 1//",
"-//ietf//dtd html strict level 2//",
"-//ietf//dtd html strict level 3//",
"-//ietf//dtd html strict//",
"-//ietf//dtd html//",
"-//metrius//dtd metrius presentational//",
"-//microsoft//dtd internet explorer 2.0 html strict//",
"-//microsoft//dtd internet explorer 2.0 html//",
"-//microsoft//dtd internet explorer 2.0 tables//",
"-//microsoft//dtd internet explorer 3.0 html strict//",
"-//microsoft//dtd internet explorer 3.0 html//",
"-//microsoft//dtd internet explorer 3.0 tables//",
"-//netscape comm. corp.//dtd html//",
"-//netscape comm. corp.//dtd strict html//",
"-//o'reilly and associates//dtd html 2.0//",
"-//o'reilly and associates//dtd html extended 1.0//",
"-//o'reilly and associates//dtd html extended relaxed 1.0//",
"-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//",
"-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//",
"-//spyglass//dtd html 2.0 extended//",
"-//sq//dtd html 2.0 hotmetal + extensions//",
"-//sun microsystems corp.//dtd hotjava html//",
"-//sun microsystems corp.//dtd hotjava strict html//",
"-//w3c//dtd html 3 1995-03-24//",
"-//w3c//dtd html 3.2 draft//",
"-//w3c//dtd html 3.2 final//",
"-//w3c//dtd html 3.2//",
"-//w3c//dtd html 3.2s draft//",
"-//w3c//dtd html 4.0 frameset//",
"-//w3c//dtd html 4.0 transitional//",
"-//w3c//dtd html experimental 19960712//",
"-//w3c//dtd html experimental 970421//",
"-//w3c//dtd w3 html//",
"-//w3o//dtd w3 html 3.0//",
"-//webtechs//dtd mozilla html 2.0//",
"-//webtechs//dtd mozilla html//")) or
publicId in ("-//w3o//dtd w3 html strict 3.0//en//",
"-/w3c/dtd html 4.0 transitional/en",
"html") or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is None or
systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"):
self.parser.compatMode = "quirks"
elif (publicId.startswith(
("-//w3c//dtd xhtml 1.0 frameset//",
"-//w3c//dtd xhtml 1.0 transitional//")) or
publicId.startswith(
("-//w3c//dtd html 4.01 frameset//",
"-//w3c//dtd html 4.01 transitional//")) and
systemId is not None):
self.parser.compatMode = "limited quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
def anythingElse(self):
self.parser.compatMode = "quirks"
self.parser.phase = self.parser.phases["beforeHtml"]
def processCharacters(self, token):
self.parser.parseError("expected-doctype-but-got-chars")
self.anythingElse()
return token
def processStartTag(self, token):
self.parser.parseError("expected-doctype-but-got-start-tag",
{"name": token["name"]})
self.anythingElse()
return token
def processEndTag(self, token):
self.parser.parseError("expected-doctype-but-got-end-tag",
{"name": token["name"]})
self.anythingElse()
return token
def processEOF(self):
self.parser.parseError("expected-doctype-but-got-eof")
self.anythingElse()
return True
class BeforeHtmlPhase(Phase):
__slots__ = tuple()
# helper methods
def insertHtmlElement(self):
self.tree.insertRoot(impliedTagToken("html", "StartTag"))
self.parser.phase = self.parser.phases["beforeHead"]
# other
def processEOF(self):
self.insertHtmlElement()
return True
def processComment(self, token):
self.tree.insertComment(token, self.tree.document)
def processSpaceCharacters(self, token):
pass
def processCharacters(self, token):
self.insertHtmlElement()
return token
def processStartTag(self, token):
if token["name"] == "html":
self.parser.firstStartTag = True
self.insertHtmlElement()
return token
def processEndTag(self, token):
if token["name"] not in ("head", "body", "html", "br"):
self.parser.parseError("unexpected-end-tag-before-html",
{"name": token["name"]})
else:
self.insertHtmlElement()
return token
class BeforeHeadPhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.startTagHead(impliedTagToken("head", "StartTag"))
return True
def processSpaceCharacters(self, token):
pass
def processCharacters(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagHead(self, token):
self.tree.insertElement(token)
self.tree.headPointer = self.tree.openElements[-1]
self.parser.phase = self.parser.phases["inHead"]
def startTagOther(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def endTagImplyHead(self, token):
self.startTagHead(impliedTagToken("head", "StartTag"))
return token
def endTagOther(self, token):
self.parser.parseError("end-tag-after-implied-root",
{"name": token["name"]})
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
(("head", "body", "html", "br"), endTagImplyHead)
])
endTagHandler.default = endTagOther
class InHeadPhase(Phase):
__slots__ = tuple()
# the real thing
def processEOF(self):
self.anythingElse()
return True
def processCharacters(self, token):
self.anythingElse()
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagHead(self, token):
self.parser.parseError("two-heads-are-not-better-than-one")
def startTagBaseLinkCommand(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
def startTagMeta(self, token):
self.tree.insertElement(token)
self.tree.openElements.pop()
token["selfClosingAcknowledged"] = True
attributes = token["data"]
if self.parser.tokenizer.stream.charEncoding[1] == "tentative":
if "charset" in attributes:
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
elif ("content" in attributes and
"http-equiv" in attributes and
attributes["http-equiv"].lower() == "content-type"):
# Encoding it as UTF-8 here is a hack, as really we should pass
# the abstract Unicode string, and just use the
# ContentAttrParser on that, but using UTF-8 allows all chars
# to be encoded and as a ASCII-superset works.
data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8"))
parser = _inputstream.ContentAttrParser(data)
codec = parser.parse()
self.parser.tokenizer.stream.changeEncoding(codec)
def startTagTitle(self, token):
self.parser.parseRCDataRawtext(token, "RCDATA")
def startTagNoFramesStyle(self, token):
# Need to decide whether to implement the scripting-disabled case
self.parser.parseRCDataRawtext(token, "RAWTEXT")
def startTagNoscript(self, token):
if self.parser.scripting:
self.parser.parseRCDataRawtext(token, "RAWTEXT")
else:
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inHeadNoscript"]
def startTagScript(self, token):
self.tree.insertElement(token)
self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState
self.parser.originalPhase = self.parser.phase
self.parser.phase = self.parser.phases["text"]
def startTagOther(self, token):
self.anythingElse()
return token
def endTagHead(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "head", "Expected head got %s" % node.name
self.parser.phase = self.parser.phases["afterHead"]
def endTagHtmlBodyBr(self, token):
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
self.endTagHead(impliedTagToken("head"))
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("title", startTagTitle),
(("noframes", "style"), startTagNoFramesStyle),
("noscript", startTagNoscript),
("script", startTagScript),
(("base", "basefont", "bgsound", "command", "link"),
startTagBaseLinkCommand),
("meta", startTagMeta),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("head", endTagHead),
(("br", "html", "body"), endTagHtmlBodyBr)
])
endTagHandler.default = endTagOther
class InHeadNoscriptPhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.parser.parseError("eof-in-head-noscript")
self.anythingElse()
return True
def processComment(self, token):
return self.parser.phases["inHead"].processComment(token)
def processCharacters(self, token):
self.parser.parseError("char-in-head-noscript")
self.anythingElse()
return token
def processSpaceCharacters(self, token):
return self.parser.phases["inHead"].processSpaceCharacters(token)
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBaseLinkCommand(self, token):
return self.parser.phases["inHead"].processStartTag(token)
def startTagHeadNoscript(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagNoscript(self, token):
node = self.parser.tree.openElements.pop()
assert node.name == "noscript", "Expected noscript got %s" % node.name
self.parser.phase = self.parser.phases["inHead"]
def endTagBr(self, token):
self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]})
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
# Caller must raise parse error first!
self.endTagNoscript(impliedTagToken("noscript"))
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
(("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand),
(("head", "noscript"), startTagHeadNoscript),
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([
("noscript", endTagNoscript),
("br", endTagBr),
])
endTagHandler.default = endTagOther
class AfterHeadPhase(Phase):
__slots__ = tuple()
def processEOF(self):
self.anythingElse()
return True
def processCharacters(self, token):
self.anythingElse()
return token
def startTagHtml(self, token):
return self.parser.phases["inBody"].processStartTag(token)
def startTagBody(self, token):
self.parser.framesetOK = False
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inBody"]
def startTagFrameset(self, token):
self.tree.insertElement(token)
self.parser.phase = self.parser.phases["inFrameset"]
def startTagFromHead(self, token):
self.parser.parseError("unexpected-start-tag-out-of-my-head",
{"name": token["name"]})
self.tree.openElements.append(self.tree.headPointer)
self.parser.phases["inHead"].processStartTag(token)
for node in self.tree.openElements[::-1]:
if node.name == "head":
self.tree.openElements.remove(node)
break
def startTagHead(self, token):
self.parser.parseError("unexpected-start-tag", {"name": token["name"]})
def startTagOther(self, token):
self.anythingElse()
return token
def endTagHtmlBodyBr(self, token):
self.anythingElse()
return token
def endTagOther(self, token):
self.parser.parseError("unexpected-end-tag", {"name": token["name"]})
def anythingElse(self):
self.tree.insertElement(impliedTagToken("body", "StartTag"))
self.parser.phase = self.parser.phases["inBody"]
self.parser.framesetOK = True
startTagHandler = _utils.MethodDispatcher([
("html", startTagHtml),
("body", startTagBody),
("frameset", startTagFrameset),
(("base", "basefont", "bgsound", "link", "meta", "noframes", "script",
"style", "title"),
startTagFromHead),
("head", startTagHead)
])
startTagHandler.default = startTagOther
endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"),
endTagHtmlBodyBr)])
endTagHandler.default = endTagOther
class InBodyPhase(Phase):
# http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody
# the really-really-really-very crazy mode
__slots__ = ("processSpaceCharacters",)
def __init__(self, *args, **kwargs):
super(InBodyPhase, self).__init__(*args, **kwargs)
# Set this to the default handler
self.processSpaceCharacters = self.processSpaceCharactersNonPre
def isMatchingFormattingElement(self, node1, node2):
return (node1.name == node2.name and
node1.namespace == node2.namespace and
node1.attributes == node2.attributes)
# helper
def addFormattingElement(self, token):
self.tree.insertElement(token)
element = self.tree.openElements[-1]
matchingElements = []
for node in self.tree.activeFormattingElements[::-1]:
if node is Marker:
break
elif self.isMatchingFormattingElement(node, element):
matchingElements.append(node)
assert len(matchingElements) <= 3
if len(matchingElements) == 3:
self.tree.activeFormattingElements.remove(matchingElements[-1])
self.tree.activeFormattingElements.append(element)
# the real deal
def processEOF(self):
allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td",
"tfoot", "th", "thead", "tr", "body",
"html"))
for node in self.tree.openElements[::-1]:
if node.name not in allowed_elements:
self.parser.parseError("expected-closing-tag-but-got-eof")
break
# Stop parsing
def processSpaceCharactersDropNewline(self, token):
# Sometimes (start of , , and