diff --git a/html5lib/_utils.py b/html5lib/_utils.py index 9ea57942..7e23ee57 100644 --- a/html5lib/_utils.py +++ b/html5lib/_utils.py @@ -145,15 +145,3 @@ def moduleFactory(baseModule, *args, **kwargs): return mod return moduleFactory - - -def memoize(func): - cache = {} - - def wrapped(*args, **kwargs): - key = (tuple(args), tuple(kwargs.items())) - if key not in cache: - cache[key] = func(*args, **kwargs) - return cache[key] - - return wrapped diff --git a/html5lib/html5parser.py b/html5lib/html5parser.py index 4c2d4c75..b3c206d1 100644 --- a/html5lib/html5parser.py +++ b/html5lib/html5parser.py @@ -1,7 +1,5 @@ from __future__ import absolute_import, division, unicode_literals -from six import with_metaclass, viewkeys - -import types +from six import viewkeys from . import _inputstream from . import _tokenizer @@ -13,7 +11,7 @@ from .constants import ( spaceCharacters, asciiUpper2Lower, specialElements, headingElements, cdataElements, rcdataElements, - tokenTypes, tagTokenTypes, + tokenTypes, namespaces, htmlIntegrationPointElements, mathmlTextIntegrationPointElements, adjustForeignAttributes as adjustForeignAttributesMap, @@ -71,18 +69,6 @@ def parseFragment(doc, container="div", treebuilder="etree", namespaceHTMLElemen return p.parseFragment(doc, container=container, **kwargs) -def method_decorator_metaclass(function): - class Decorated(type): - def __new__(meta, classname, bases, classDict): - for attributeName, attribute in classDict.items(): - if isinstance(attribute, types.FunctionType): - attribute = function(attribute) - - classDict[attributeName] = attribute - return type.__new__(meta, classname, bases, classDict) - return Decorated - - class HTMLParser(object): """HTML parser @@ -112,6 +98,7 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa # Raise an exception on the first error encountered self.strict = strict + self.debug = debug if tree is None: tree = treebuilders.getTreeBuilder("etree") @@ -122,7 +109,7 @@ def __init__(self, tree=None, strict=False, namespaceHTMLElements=True, debug=Fa self.errors = [] self.phases = {name: cls(self, self.tree) for name, cls in - getPhases(debug).items()} + _phases.items()} def _parse(self, stream, innerHTML=False, container="div", scripting=False, **kwargs): @@ -204,6 +191,9 @@ def mainLoop(self): DoctypeToken = tokenTypes["Doctype"] ParseErrorToken = tokenTypes["ParseError"] + type_names = {value: key for key, value in tokenTypes.items()} + debug = self.debug + for token in self.tokenizer: prev_token = None new_token = token @@ -235,6 +225,17 @@ def mainLoop(self): else: phase = self.phases["inForeignContent"] + if debug: + info = {"type": type_names[type]} + if type in (StartTagToken, EndTagToken): + info["name"] = new_token['name'] + + self.log.append((self.tokenizer.state.__name__, + self.phase.__class__.__name__, + phase.__class__.__name__, + "process" + info["type"], + info)) + if type == CharactersToken: new_token = phase.processCharacters(new_token) elif type == SpaceCharactersToken: @@ -396,2386 +397,2380 @@ def parseRCDataRawtext(self, token, contentType): self.phase = self.phases["text"] -@_utils.memoize -def getPhases(debug): - def log(function): - """Logger that records which phase processes each token""" - type_names = {value: key for key, value in tokenTypes.items()} - - def wrapped(self, *args, **kwargs): - if function.__name__.startswith("process") and len(args) > 0: - token = args[0] - info = {"type": type_names[token['type']]} - if token['type'] in tagTokenTypes: - info["name"] = token['name'] - - self.parser.log.append((self.parser.tokenizer.state.__name__, - self.parser.phase.__class__.__name__, - self.__class__.__name__, - function.__name__, - info)) - return function(self, *args, **kwargs) - else: - return function(self, *args, **kwargs) - return wrapped - - def getMetaclass(use_metaclass, metaclass_func): - if use_metaclass: - return method_decorator_metaclass(metaclass_func) +class Phase(object): + """Base class for helper object that implements each phase of processing + """ + __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") + + def __init__(self, parser, tree): + self.parser = parser + self.tree = tree + self.__startTagCache = {} + self.__endTagCache = {} + + def processEOF(self): + raise NotImplementedError + + def processComment(self, token): + # For most phases the following is correct. Where it's not it will be + # overridden. + self.tree.insertComment(token, self.tree.openElements[-1]) + + def processDoctype(self, token): + self.parser.parseError("unexpected-doctype") + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processSpaceCharacters(self, token): + self.tree.insertText(token["data"]) + + def processStartTag(self, token): + # Note the caching is done here rather than BoundMethodDispatcher as doing it there + # requires a circular reference to the Phase, and this ends up with a significant + # (CPython 2.7, 3.8) GC cost when parsing many short inputs + name = token["name"] + # In Py2, using `in` is quicker in general than try/except KeyError + # In Py3, `in` is quicker when there are few cache hits (typically short inputs) + if name in self.__startTagCache: + func = self.__startTagCache[name] else: - return type - - # pylint:disable=unused-argument - class Phase(with_metaclass(getMetaclass(debug, log))): - """Base class for helper object that implements each phase of processing - """ - __slots__ = ("parser", "tree", "__startTagCache", "__endTagCache") - - def __init__(self, parser, tree): - self.parser = parser - self.tree = tree - self.__startTagCache = {} - self.__endTagCache = {} - - def processEOF(self): - raise NotImplementedError - - def processComment(self, token): - # For most phases the following is correct. Where it's not it will be - # overridden. - self.tree.insertComment(token, self.tree.openElements[-1]) - - def processDoctype(self, token): - self.parser.parseError("unexpected-doctype") - - def processCharacters(self, token): - self.tree.insertText(token["data"]) - - def processSpaceCharacters(self, token): - self.tree.insertText(token["data"]) - - def processStartTag(self, token): - # Note the caching is done here rather than BoundMethodDispatcher as doing it there - # requires a circular reference to the Phase, and this ends up with a significant - # (CPython 2.7, 3.8) GC cost when parsing many short inputs - name = token["name"] - # In Py2, using `in` is quicker in general than try/except KeyError - # In Py3, `in` is quicker when there are few cache hits (typically short inputs) - if name in self.__startTagCache: - func = self.__startTagCache[name] - else: - func = self.__startTagCache[name] = self.startTagHandler[name] - # bound the cache size in case we get loads of unknown tags - while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: - # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 - self.__startTagCache.pop(next(iter(self.__startTagCache))) - return func(token) - - def startTagHtml(self, token): - if not self.parser.firstStartTag and token["name"] == "html": - self.parser.parseError("non-html-root") - # XXX Need a check here to see if the first start tag token emitted is - # this token... If it's not, invoke self.parser.parseError(). - for attr, value in token["data"].items(): - if attr not in self.tree.openElements[0].attributes: - self.tree.openElements[0].attributes[attr] = value - self.parser.firstStartTag = False - - def processEndTag(self, token): - # Note the caching is done here rather than BoundMethodDispatcher as doing it there - # requires a circular reference to the Phase, and this ends up with a significant - # (CPython 2.7, 3.8) GC cost when parsing many short inputs - name = token["name"] - # In Py2, using `in` is quicker in general than try/except KeyError - # In Py3, `in` is quicker when there are few cache hits (typically short inputs) - if name in self.__endTagCache: - func = self.__endTagCache[name] - else: - func = self.__endTagCache[name] = self.endTagHandler[name] - # bound the cache size in case we get loads of unknown tags - while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: - # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 - self.__endTagCache.pop(next(iter(self.__endTagCache))) - return func(token) - - class InitialPhase(Phase): - __slots__ = tuple() - - def processSpaceCharacters(self, token): - pass - - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) - - def processDoctype(self, token): - name = token["name"] - publicId = token["publicId"] - systemId = token["systemId"] - correct = token["correct"] - - if (name != "html" or publicId is not None or - systemId is not None and systemId != "about:legacy-compat"): - self.parser.parseError("unknown-doctype") - - if publicId is None: - publicId = "" - - self.tree.insertDoctype(token) - - if publicId != "": - publicId = publicId.translate(asciiUpper2Lower) - - if (not correct or token["name"] != "html" or - publicId.startswith( - ("+//silmaril//dtd html pro v0r11 19970101//", - "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", - "-//as//dtd html 3.0 aswedit + extensions//", - "-//ietf//dtd html 2.0 level 1//", - "-//ietf//dtd html 2.0 level 2//", - "-//ietf//dtd html 2.0 strict level 1//", - "-//ietf//dtd html 2.0 strict level 2//", - "-//ietf//dtd html 2.0 strict//", - "-//ietf//dtd html 2.0//", - "-//ietf//dtd html 2.1e//", - "-//ietf//dtd html 3.0//", - "-//ietf//dtd html 3.2 final//", - "-//ietf//dtd html 3.2//", - "-//ietf//dtd html 3//", - "-//ietf//dtd html level 0//", - "-//ietf//dtd html level 1//", - "-//ietf//dtd html level 2//", - "-//ietf//dtd html level 3//", - "-//ietf//dtd html strict level 0//", - "-//ietf//dtd html strict level 1//", - "-//ietf//dtd html strict level 2//", - "-//ietf//dtd html strict level 3//", - "-//ietf//dtd html strict//", - "-//ietf//dtd html//", - "-//metrius//dtd metrius presentational//", - "-//microsoft//dtd internet explorer 2.0 html strict//", - "-//microsoft//dtd internet explorer 2.0 html//", - "-//microsoft//dtd internet explorer 2.0 tables//", - "-//microsoft//dtd internet explorer 3.0 html strict//", - "-//microsoft//dtd internet explorer 3.0 html//", - "-//microsoft//dtd internet explorer 3.0 tables//", - "-//netscape comm. corp.//dtd html//", - "-//netscape comm. corp.//dtd strict html//", - "-//o'reilly and associates//dtd html 2.0//", - "-//o'reilly and associates//dtd html extended 1.0//", - "-//o'reilly and associates//dtd html extended relaxed 1.0//", - "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", - "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", - "-//spyglass//dtd html 2.0 extended//", - "-//sq//dtd html 2.0 hotmetal + extensions//", - "-//sun microsystems corp.//dtd hotjava html//", - "-//sun microsystems corp.//dtd hotjava strict html//", - "-//w3c//dtd html 3 1995-03-24//", - "-//w3c//dtd html 3.2 draft//", - "-//w3c//dtd html 3.2 final//", - "-//w3c//dtd html 3.2//", - "-//w3c//dtd html 3.2s draft//", - "-//w3c//dtd html 4.0 frameset//", - "-//w3c//dtd html 4.0 transitional//", - "-//w3c//dtd html experimental 19960712//", - "-//w3c//dtd html experimental 970421//", - "-//w3c//dtd w3 html//", - "-//w3o//dtd w3 html 3.0//", - "-//webtechs//dtd mozilla html 2.0//", - "-//webtechs//dtd mozilla html//")) or - publicId in ("-//w3o//dtd w3 html strict 3.0//en//", - "-/w3c/dtd html 4.0 transitional/en", - "html") or - publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is None or - systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): - self.parser.compatMode = "quirks" - elif (publicId.startswith( - ("-//w3c//dtd xhtml 1.0 frameset//", - "-//w3c//dtd xhtml 1.0 transitional//")) or - publicId.startswith( - ("-//w3c//dtd html 4.01 frameset//", - "-//w3c//dtd html 4.01 transitional//")) and - systemId is not None): - self.parser.compatMode = "limited quirks" - - self.parser.phase = self.parser.phases["beforeHtml"] - - def anythingElse(self): + func = self.__startTagCache[name] = self.startTagHandler[name] + # bound the cache size in case we get loads of unknown tags + while len(self.__startTagCache) > len(self.startTagHandler) * 1.1: + # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 + self.__startTagCache.pop(next(iter(self.__startTagCache))) + return func(token) + + def startTagHtml(self, token): + if not self.parser.firstStartTag and token["name"] == "html": + self.parser.parseError("non-html-root") + # XXX Need a check here to see if the first start tag token emitted is + # this token... If it's not, invoke self.parser.parseError(). + for attr, value in token["data"].items(): + if attr not in self.tree.openElements[0].attributes: + self.tree.openElements[0].attributes[attr] = value + self.parser.firstStartTag = False + + def processEndTag(self, token): + # Note the caching is done here rather than BoundMethodDispatcher as doing it there + # requires a circular reference to the Phase, and this ends up with a significant + # (CPython 2.7, 3.8) GC cost when parsing many short inputs + name = token["name"] + # In Py2, using `in` is quicker in general than try/except KeyError + # In Py3, `in` is quicker when there are few cache hits (typically short inputs) + if name in self.__endTagCache: + func = self.__endTagCache[name] + else: + func = self.__endTagCache[name] = self.endTagHandler[name] + # bound the cache size in case we get loads of unknown tags + while len(self.__endTagCache) > len(self.endTagHandler) * 1.1: + # this makes the eviction policy random on Py < 3.7 and FIFO >= 3.7 + self.__endTagCache.pop(next(iter(self.__endTagCache))) + return func(token) + + +class InitialPhase(Phase): + __slots__ = tuple() + + def processSpaceCharacters(self, token): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processDoctype(self, token): + name = token["name"] + publicId = token["publicId"] + systemId = token["systemId"] + correct = token["correct"] + + if (name != "html" or publicId is not None or + systemId is not None and systemId != "about:legacy-compat"): + self.parser.parseError("unknown-doctype") + + if publicId is None: + publicId = "" + + self.tree.insertDoctype(token) + + if publicId != "": + publicId = publicId.translate(asciiUpper2Lower) + + if (not correct or token["name"] != "html" or + publicId.startswith( + ("+//silmaril//dtd html pro v0r11 19970101//", + "-//advasoft ltd//dtd html 3.0 aswedit + extensions//", + "-//as//dtd html 3.0 aswedit + extensions//", + "-//ietf//dtd html 2.0 level 1//", + "-//ietf//dtd html 2.0 level 2//", + "-//ietf//dtd html 2.0 strict level 1//", + "-//ietf//dtd html 2.0 strict level 2//", + "-//ietf//dtd html 2.0 strict//", + "-//ietf//dtd html 2.0//", + "-//ietf//dtd html 2.1e//", + "-//ietf//dtd html 3.0//", + "-//ietf//dtd html 3.2 final//", + "-//ietf//dtd html 3.2//", + "-//ietf//dtd html 3//", + "-//ietf//dtd html level 0//", + "-//ietf//dtd html level 1//", + "-//ietf//dtd html level 2//", + "-//ietf//dtd html level 3//", + "-//ietf//dtd html strict level 0//", + "-//ietf//dtd html strict level 1//", + "-//ietf//dtd html strict level 2//", + "-//ietf//dtd html strict level 3//", + "-//ietf//dtd html strict//", + "-//ietf//dtd html//", + "-//metrius//dtd metrius presentational//", + "-//microsoft//dtd internet explorer 2.0 html strict//", + "-//microsoft//dtd internet explorer 2.0 html//", + "-//microsoft//dtd internet explorer 2.0 tables//", + "-//microsoft//dtd internet explorer 3.0 html strict//", + "-//microsoft//dtd internet explorer 3.0 html//", + "-//microsoft//dtd internet explorer 3.0 tables//", + "-//netscape comm. corp.//dtd html//", + "-//netscape comm. corp.//dtd strict html//", + "-//o'reilly and associates//dtd html 2.0//", + "-//o'reilly and associates//dtd html extended 1.0//", + "-//o'reilly and associates//dtd html extended relaxed 1.0//", + "-//softquad software//dtd hotmetal pro 6.0::19990601::extensions to html 4.0//", + "-//softquad//dtd hotmetal pro 4.0::19971010::extensions to html 4.0//", + "-//spyglass//dtd html 2.0 extended//", + "-//sq//dtd html 2.0 hotmetal + extensions//", + "-//sun microsystems corp.//dtd hotjava html//", + "-//sun microsystems corp.//dtd hotjava strict html//", + "-//w3c//dtd html 3 1995-03-24//", + "-//w3c//dtd html 3.2 draft//", + "-//w3c//dtd html 3.2 final//", + "-//w3c//dtd html 3.2//", + "-//w3c//dtd html 3.2s draft//", + "-//w3c//dtd html 4.0 frameset//", + "-//w3c//dtd html 4.0 transitional//", + "-//w3c//dtd html experimental 19960712//", + "-//w3c//dtd html experimental 970421//", + "-//w3c//dtd w3 html//", + "-//w3o//dtd w3 html 3.0//", + "-//webtechs//dtd mozilla html 2.0//", + "-//webtechs//dtd mozilla html//")) or + publicId in ("-//w3o//dtd w3 html strict 3.0//en//", + "-/w3c/dtd html 4.0 transitional/en", + "html") or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is None or + systemId and systemId.lower() == "http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd"): self.parser.compatMode = "quirks" - self.parser.phase = self.parser.phases["beforeHtml"] - - def processCharacters(self, token): - self.parser.parseError("expected-doctype-but-got-chars") - self.anythingElse() - return token - - def processStartTag(self, token): - self.parser.parseError("expected-doctype-but-got-start-tag", + elif (publicId.startswith( + ("-//w3c//dtd xhtml 1.0 frameset//", + "-//w3c//dtd xhtml 1.0 transitional//")) or + publicId.startswith( + ("-//w3c//dtd html 4.01 frameset//", + "-//w3c//dtd html 4.01 transitional//")) and + systemId is not None): + self.parser.compatMode = "limited quirks" + + self.parser.phase = self.parser.phases["beforeHtml"] + + def anythingElse(self): + self.parser.compatMode = "quirks" + self.parser.phase = self.parser.phases["beforeHtml"] + + def processCharacters(self, token): + self.parser.parseError("expected-doctype-but-got-chars") + self.anythingElse() + return token + + def processStartTag(self, token): + self.parser.parseError("expected-doctype-but-got-start-tag", + {"name": token["name"]}) + self.anythingElse() + return token + + def processEndTag(self, token): + self.parser.parseError("expected-doctype-but-got-end-tag", + {"name": token["name"]}) + self.anythingElse() + return token + + def processEOF(self): + self.parser.parseError("expected-doctype-but-got-eof") + self.anythingElse() + return True + + +class BeforeHtmlPhase(Phase): + __slots__ = tuple() + + # helper methods + def insertHtmlElement(self): + self.tree.insertRoot(impliedTagToken("html", "StartTag")) + self.parser.phase = self.parser.phases["beforeHead"] + + # other + def processEOF(self): + self.insertHtmlElement() + return True + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + pass + + def processCharacters(self, token): + self.insertHtmlElement() + return token + + def processStartTag(self, token): + if token["name"] == "html": + self.parser.firstStartTag = True + self.insertHtmlElement() + return token + + def processEndTag(self, token): + if token["name"] not in ("head", "body", "html", "br"): + self.parser.parseError("unexpected-end-tag-before-html", {"name": token["name"]}) - self.anythingElse() + else: + self.insertHtmlElement() return token - def processEndTag(self, token): - self.parser.parseError("expected-doctype-but-got-end-tag", - {"name": token["name"]}) - self.anythingElse() - return token - def processEOF(self): - self.parser.parseError("expected-doctype-but-got-eof") - self.anythingElse() - return True +class BeforeHeadPhase(Phase): + __slots__ = tuple() - class BeforeHtmlPhase(Phase): - __slots__ = tuple() + def processEOF(self): + self.startTagHead(impliedTagToken("head", "StartTag")) + return True - # helper methods - def insertHtmlElement(self): - self.tree.insertRoot(impliedTagToken("html", "StartTag")) - self.parser.phase = self.parser.phases["beforeHead"] + def processSpaceCharacters(self, token): + pass - # other - def processEOF(self): - self.insertHtmlElement() - return True + def processCharacters(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def processSpaceCharacters(self, token): - pass + def startTagHead(self, token): + self.tree.insertElement(token) + self.tree.headPointer = self.tree.openElements[-1] + self.parser.phase = self.parser.phases["inHead"] - def processCharacters(self, token): - self.insertHtmlElement() - return token + def startTagOther(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processStartTag(self, token): - if token["name"] == "html": - self.parser.firstStartTag = True - self.insertHtmlElement() - return token + def endTagImplyHead(self, token): + self.startTagHead(impliedTagToken("head", "StartTag")) + return token - def processEndTag(self, token): - if token["name"] not in ("head", "body", "html", "br"): - self.parser.parseError("unexpected-end-tag-before-html", - {"name": token["name"]}) - else: - self.insertHtmlElement() - return token + def endTagOther(self, token): + self.parser.parseError("end-tag-after-implied-root", + {"name": token["name"]}) - class BeforeHeadPhase(Phase): - __slots__ = tuple() + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther - def processEOF(self): - self.startTagHead(impliedTagToken("head", "StartTag")) - return True + endTagHandler = _utils.MethodDispatcher([ + (("head", "body", "html", "br"), endTagImplyHead) + ]) + endTagHandler.default = endTagOther - def processSpaceCharacters(self, token): - pass - def processCharacters(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token +class InHeadPhase(Phase): + __slots__ = tuple() - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + # the real thing + def processEOF(self): + self.anythingElse() + return True - def startTagHead(self, token): - self.tree.insertElement(token) - self.tree.headPointer = self.tree.openElements[-1] - self.parser.phase = self.parser.phases["inHead"] + def processCharacters(self, token): + self.anythingElse() + return token - def startTagOther(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def endTagImplyHead(self, token): - self.startTagHead(impliedTagToken("head", "StartTag")) - return token + def startTagHead(self, token): + self.parser.parseError("two-heads-are-not-better-than-one") - def endTagOther(self, token): - self.parser.parseError("end-tag-after-implied-root", - {"name": token["name"]}) + def startTagBaseLinkCommand(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml), - ("head", startTagHead) - ]) - startTagHandler.default = startTagOther + def startTagMeta(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + + attributes = token["data"] + if self.parser.tokenizer.stream.charEncoding[1] == "tentative": + if "charset" in attributes: + self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) + elif ("content" in attributes and + "http-equiv" in attributes and + attributes["http-equiv"].lower() == "content-type"): + # Encoding it as UTF-8 here is a hack, as really we should pass + # the abstract Unicode string, and just use the + # ContentAttrParser on that, but using UTF-8 allows all chars + # to be encoded and as a ASCII-superset works. + data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) + parser = _inputstream.ContentAttrParser(data) + codec = parser.parse() + self.parser.tokenizer.stream.changeEncoding(codec) + + def startTagTitle(self, token): + self.parser.parseRCDataRawtext(token, "RCDATA") + + def startTagNoFramesStyle(self, token): + # Need to decide whether to implement the scripting-disabled case + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagNoscript(self, token): + if self.parser.scripting: + self.parser.parseRCDataRawtext(token, "RAWTEXT") + else: + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inHeadNoscript"] - endTagHandler = _utils.MethodDispatcher([ - (("head", "body", "html", "br"), endTagImplyHead) - ]) - endTagHandler.default = endTagOther + def startTagScript(self, token): + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState + self.parser.originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["text"] + + def startTagOther(self, token): + self.anythingElse() + return token + + def endTagHead(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "head", "Expected head got %s" % node.name + self.parser.phase = self.parser.phases["afterHead"] + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + self.endTagHead(impliedTagToken("head")) + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("title", startTagTitle), + (("noframes", "style"), startTagNoFramesStyle), + ("noscript", startTagNoscript), + ("script", startTagScript), + (("base", "basefont", "bgsound", "command", "link"), + startTagBaseLinkCommand), + ("meta", startTagMeta), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("head", endTagHead), + (("br", "html", "body"), endTagHtmlBodyBr) + ]) + endTagHandler.default = endTagOther + + +class InHeadNoscriptPhase(Phase): + __slots__ = tuple() + + def processEOF(self): + self.parser.parseError("eof-in-head-noscript") + self.anythingElse() + return True + + def processComment(self, token): + return self.parser.phases["inHead"].processComment(token) + + def processCharacters(self, token): + self.parser.parseError("char-in-head-noscript") + self.anythingElse() + return token + + def processSpaceCharacters(self, token): + return self.parser.phases["inHead"].processSpaceCharacters(token) + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBaseLinkCommand(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagHeadNoscript(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.anythingElse() + return token + + def endTagNoscript(self, token): + node = self.parser.tree.openElements.pop() + assert node.name == "noscript", "Expected noscript got %s" % node.name + self.parser.phase = self.parser.phases["inHead"] + + def endTagBr(self, token): + self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + # Caller must raise parse error first! + self.endTagNoscript(impliedTagToken("noscript")) + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand), + (("head", "noscript"), startTagHeadNoscript), + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("noscript", endTagNoscript), + ("br", endTagBr), + ]) + endTagHandler.default = endTagOther + + +class AfterHeadPhase(Phase): + __slots__ = tuple() + + def processEOF(self): + self.anythingElse() + return True + + def processCharacters(self, token): + self.anythingElse() + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagBody(self, token): + self.parser.framesetOK = False + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inBody"] - class InHeadPhase(Phase): - __slots__ = tuple() + def startTagFrameset(self, token): + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inFrameset"] - # the real thing - def processEOF(self): - self.anythingElse() - return True + def startTagFromHead(self, token): + self.parser.parseError("unexpected-start-tag-out-of-my-head", + {"name": token["name"]}) + self.tree.openElements.append(self.tree.headPointer) + self.parser.phases["inHead"].processStartTag(token) + for node in self.tree.openElements[::-1]: + if node.name == "head": + self.tree.openElements.remove(node) + break - def processCharacters(self, token): - self.anythingElse() - return token + def startTagHead(self, token): + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + + def startTagOther(self, token): + self.anythingElse() + return token + + def endTagHtmlBodyBr(self, token): + self.anythingElse() + return token + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def anythingElse(self): + self.tree.insertElement(impliedTagToken("body", "StartTag")) + self.parser.phase = self.parser.phases["inBody"] + self.parser.framesetOK = True + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("body", startTagBody), + ("frameset", startTagFrameset), + (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", + "style", "title"), + startTagFromHead), + ("head", startTagHead) + ]) + startTagHandler.default = startTagOther + endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), + endTagHtmlBodyBr)]) + endTagHandler.default = endTagOther + + +class InBodyPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody + # the really-really-really-very crazy mode + __slots__ = ("processSpaceCharacters",) + + def __init__(self, *args, **kwargs): + super(InBodyPhase, self).__init__(*args, **kwargs) + # Set this to the default handler + self.processSpaceCharacters = self.processSpaceCharactersNonPre + + def isMatchingFormattingElement(self, node1, node2): + return (node1.name == node2.name and + node1.namespace == node2.namespace and + node1.attributes == node2.attributes) + + # helper + def addFormattingElement(self, token): + self.tree.insertElement(token) + element = self.tree.openElements[-1] - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + matchingElements = [] + for node in self.tree.activeFormattingElements[::-1]: + if node is Marker: + break + elif self.isMatchingFormattingElement(node, element): + matchingElements.append(node) + + assert len(matchingElements) <= 3 + if len(matchingElements) == 3: + self.tree.activeFormattingElements.remove(matchingElements[-1]) + self.tree.activeFormattingElements.append(element) + + # the real deal + def processEOF(self): + allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", + "tfoot", "th", "thead", "tr", "body", + "html")) + for node in self.tree.openElements[::-1]: + if node.name not in allowed_elements: + self.parser.parseError("expected-closing-tag-but-got-eof") + break + # Stop parsing + + def processSpaceCharactersDropNewline(self, token): + # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we + # want to drop leading newlines + data = token["data"] + self.processSpaceCharacters = self.processSpaceCharactersNonPre + if (data.startswith("\n") and + self.tree.openElements[-1].name in ("pre", "listing", "textarea") and + not self.tree.openElements[-1].hasContent()): + data = data[1:] + if data: + self.tree.reconstructActiveFormattingElements() + self.tree.insertText(data) + + def processCharacters(self, token): + if token["data"] == "\u0000": + # The tokenizer should always emit null on its own + return + self.tree.reconstructActiveFormattingElements() + self.tree.insertText(token["data"]) + # This must be bad for performance + if (self.parser.framesetOK and + any(char not in spaceCharacters + for char in token["data"])): + self.parser.framesetOK = False - def startTagHead(self, token): - self.parser.parseError("two-heads-are-not-better-than-one") + def processSpaceCharactersNonPre(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertText(token["data"]) - def startTagBaseLinkCommand(self, token): - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True + def startTagProcessInHead(self, token): + return self.parser.phases["inHead"].processStartTag(token) - def startTagMeta(self, token): + def startTagBody(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "body"}) + if (len(self.tree.openElements) == 1 or + self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + else: + self.parser.framesetOK = False + for attr, value in token["data"].items(): + if attr not in self.tree.openElements[1].attributes: + self.tree.openElements[1].attributes[attr] = value + + def startTagFrameset(self, token): + self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) + if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): + assert self.parser.innerHTML + elif not self.parser.framesetOK: + pass + else: + if self.tree.openElements[1].parent: + self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) + while self.tree.openElements[-1].name != "html": + self.tree.openElements.pop() self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True + self.parser.phase = self.parser.phases["inFrameset"] - attributes = token["data"] - if self.parser.tokenizer.stream.charEncoding[1] == "tentative": - if "charset" in attributes: - self.parser.tokenizer.stream.changeEncoding(attributes["charset"]) - elif ("content" in attributes and - "http-equiv" in attributes and - attributes["http-equiv"].lower() == "content-type"): - # Encoding it as UTF-8 here is a hack, as really we should pass - # the abstract Unicode string, and just use the - # ContentAttrParser on that, but using UTF-8 allows all chars - # to be encoded and as a ASCII-superset works. - data = _inputstream.EncodingBytes(attributes["content"].encode("utf-8")) - parser = _inputstream.ContentAttrParser(data) - codec = parser.parse() - self.parser.tokenizer.stream.changeEncoding(codec) - - def startTagTitle(self, token): - self.parser.parseRCDataRawtext(token, "RCDATA") - - def startTagNoFramesStyle(self, token): - # Need to decide whether to implement the scripting-disabled case - self.parser.parseRCDataRawtext(token, "RAWTEXT") + def startTagCloseP(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) - def startTagNoscript(self, token): - if self.parser.scripting: - self.parser.parseRCDataRawtext(token, "RAWTEXT") - else: - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inHeadNoscript"] + def startTagPreListing(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False + self.processSpaceCharacters = self.processSpaceCharactersDropNewline - def startTagScript(self, token): + def startTagForm(self, token): + if self.tree.formPointer: + self.parser.parseError("unexpected-start-tag", {"name": "form"}) + else: + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) self.tree.insertElement(token) - self.parser.tokenizer.state = self.parser.tokenizer.scriptDataState - self.parser.originalPhase = self.parser.phase - self.parser.phase = self.parser.phases["text"] + self.tree.formPointer = self.tree.openElements[-1] - def startTagOther(self, token): - self.anythingElse() - return token + def startTagListItem(self, token): + self.parser.framesetOK = False - def endTagHead(self, token): - node = self.parser.tree.openElements.pop() - assert node.name == "head", "Expected head got %s" % node.name - self.parser.phase = self.parser.phases["afterHead"] + stopNamesMap = {"li": ["li"], + "dt": ["dt", "dd"], + "dd": ["dt", "dd"]} + stopNames = stopNamesMap[token["name"]] + for node in reversed(self.tree.openElements): + if node.name in stopNames: + self.parser.phase.processEndTag( + impliedTagToken(node.name, "EndTag")) + break + if (node.nameTuple in specialElements and + node.name not in ("address", "div", "p")): + break - def endTagHtmlBodyBr(self, token): - self.anythingElse() - return token + if self.tree.elementInScope("p", variant="button"): + self.parser.phase.processEndTag( + impliedTagToken("p", "EndTag")) - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + self.tree.insertElement(token) - def anythingElse(self): - self.endTagHead(impliedTagToken("head")) - - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml), - ("title", startTagTitle), - (("noframes", "style"), startTagNoFramesStyle), - ("noscript", startTagNoscript), - ("script", startTagScript), - (("base", "basefont", "bgsound", "command", "link"), - startTagBaseLinkCommand), - ("meta", startTagMeta), - ("head", startTagHead) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - ("head", endTagHead), - (("br", "html", "body"), endTagHtmlBodyBr) - ]) - endTagHandler.default = endTagOther - - class InHeadNoscriptPhase(Phase): - __slots__ = tuple() - - def processEOF(self): - self.parser.parseError("eof-in-head-noscript") - self.anythingElse() - return True - - def processComment(self, token): - return self.parser.phases["inHead"].processComment(token) - - def processCharacters(self, token): - self.parser.parseError("char-in-head-noscript") - self.anythingElse() - return token + def startTagPlaintext(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.plaintextState - def processSpaceCharacters(self, token): - return self.parser.phases["inHead"].processSpaceCharacters(token) + def startTagHeading(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + if self.tree.openElements[-1].name in headingElements: + self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + self.tree.openElements.pop() + self.tree.insertElement(token) - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + def startTagA(self, token): + afeAElement = self.tree.elementInActiveFormattingElements("a") + if afeAElement: + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "a", "endName": "a"}) + self.endTagFormatting(impliedTagToken("a")) + if afeAElement in self.tree.openElements: + self.tree.openElements.remove(afeAElement) + if afeAElement in self.tree.activeFormattingElements: + self.tree.activeFormattingElements.remove(afeAElement) + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) + + def startTagFormatting(self, token): + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) + + def startTagNobr(self, token): + self.tree.reconstructActiveFormattingElements() + if self.tree.elementInScope("nobr"): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "nobr", "endName": "nobr"}) + self.processEndTag(impliedTagToken("nobr")) + # XXX Need tests that trigger the following + self.tree.reconstructActiveFormattingElements() + self.addFormattingElement(token) - def startTagBaseLinkCommand(self, token): - return self.parser.phases["inHead"].processStartTag(token) + def startTagButton(self, token): + if self.tree.elementInScope("button"): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "button", "endName": "button"}) + self.processEndTag(impliedTagToken("button")) + return token + else: + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.parser.framesetOK = False - def startTagHeadNoscript(self, token): - self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) + def startTagAppletMarqueeObject(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.tree.activeFormattingElements.append(Marker) + self.parser.framesetOK = False + + def startTagXmp(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.reconstructActiveFormattingElements() + self.parser.framesetOK = False + self.parser.parseRCDataRawtext(token, "RAWTEXT") + + def startTagTable(self, token): + if self.parser.compatMode != "quirks": + if self.tree.elementInScope("p", variant="button"): + self.processEndTag(impliedTagToken("p")) + self.tree.insertElement(token) + self.parser.framesetOK = False + self.parser.phase = self.parser.phases["inTable"] - def startTagOther(self, token): - self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) - self.anythingElse() - return token + def startTagVoidFormatting(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False + + def startTagInput(self, token): + framesetOK = self.parser.framesetOK + self.startTagVoidFormatting(token) + if ("type" in token["data"] and + token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + # input type=hidden doesn't change framesetOK + self.parser.framesetOK = framesetOK + + def startTagParamSource(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True - def endTagNoscript(self, token): - node = self.parser.tree.openElements.pop() - assert node.name == "noscript", "Expected noscript got %s" % node.name - self.parser.phase = self.parser.phases["inHead"] + def startTagHr(self, token): + if self.tree.elementInScope("p", variant="button"): + self.endTagP(impliedTagToken("p")) + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True + self.parser.framesetOK = False + + def startTagImage(self, token): + # No really... + self.parser.parseError("unexpected-start-tag-treated-as", + {"originalName": "image", "newName": "img"}) + self.processStartTag(impliedTagToken("img", "StartTag", + attributes=token["data"], + selfClosing=token["selfClosing"])) + + def startTagIsIndex(self, token): + self.parser.parseError("deprecated-tag", {"name": "isindex"}) + if self.tree.formPointer: + return + form_attrs = {} + if "action" in token["data"]: + form_attrs["action"] = token["data"]["action"] + self.processStartTag(impliedTagToken("form", "StartTag", + attributes=form_attrs)) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processStartTag(impliedTagToken("label", "StartTag")) + # XXX Localization ... + if "prompt" in token["data"]: + prompt = token["data"]["prompt"] + else: + prompt = "This is a searchable index. Enter search keywords: " + self.processCharacters( + {"type": tokenTypes["Characters"], "data": prompt}) + attributes = token["data"].copy() + if "action" in attributes: + del attributes["action"] + if "prompt" in attributes: + del attributes["prompt"] + attributes["name"] = "isindex" + self.processStartTag(impliedTagToken("input", "StartTag", + attributes=attributes, + selfClosing=token["selfClosing"])) + self.processEndTag(impliedTagToken("label")) + self.processStartTag(impliedTagToken("hr", "StartTag")) + self.processEndTag(impliedTagToken("form")) + + def startTagTextarea(self, token): + self.tree.insertElement(token) + self.parser.tokenizer.state = self.parser.tokenizer.rcdataState + self.processSpaceCharacters = self.processSpaceCharactersDropNewline + self.parser.framesetOK = False - def endTagBr(self, token): - self.parser.parseError("unexpected-inhead-noscript-tag", {"name": token["name"]}) - self.anythingElse() - return token + def startTagIFrame(self, token): + self.parser.framesetOK = False + self.startTagRawtext(token) - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + def startTagNoscript(self, token): + if self.parser.scripting: + self.startTagRawtext(token) + else: + self.startTagOther(token) - def anythingElse(self): - # Caller must raise parse error first! - self.endTagNoscript(impliedTagToken("noscript")) + def startTagRawtext(self, token): + """iframe, noembed noframes, noscript(if scripting enabled)""" + self.parser.parseRCDataRawtext(token, "RAWTEXT") - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml), - (("basefont", "bgsound", "link", "meta", "noframes", "style"), startTagBaseLinkCommand), - (("head", "noscript"), startTagHeadNoscript), - ]) - startTagHandler.default = startTagOther + def startTagOpt(self, token): + if self.tree.openElements[-1].name == "option": + self.parser.phase.processEndTag(impliedTagToken("option")) + self.tree.reconstructActiveFormattingElements() + self.parser.tree.insertElement(token) - endTagHandler = _utils.MethodDispatcher([ - ("noscript", endTagNoscript), - ("br", endTagBr), - ]) - endTagHandler.default = endTagOther + def startTagSelect(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) + self.parser.framesetOK = False + if self.parser.phase in (self.parser.phases["inTable"], + self.parser.phases["inCaption"], + self.parser.phases["inColumnGroup"], + self.parser.phases["inTableBody"], + self.parser.phases["inRow"], + self.parser.phases["inCell"]): + self.parser.phase = self.parser.phases["inSelectInTable"] + else: + self.parser.phase = self.parser.phases["inSelect"] - class AfterHeadPhase(Phase): - __slots__ = tuple() + def startTagRpRt(self, token): + if self.tree.elementInScope("ruby"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "ruby": + self.parser.parseError() + self.tree.insertElement(token) - def processEOF(self): - self.anythingElse() - return True + def startTagMath(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustMathMLAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["mathml"] + self.tree.insertElement(token) + # Need to get the parse error right for the case where the token + # has a namespace not equal to the xmlns attribute + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True - def processCharacters(self, token): - self.anythingElse() - return token + def startTagSvg(self, token): + self.tree.reconstructActiveFormattingElements() + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = namespaces["svg"] + self.tree.insertElement(token) + # Need to get the parse error right for the case where the token + # has a namespace not equal to the xmlns attribute + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + def startTagMisplaced(self, token): + """ Elements that should be children of other elements that have a + different insertion mode; here they are ignored + "caption", "col", "colgroup", "frame", "frameset", "head", + "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", + "tr", "noscript" + """ + self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) - def startTagBody(self, token): - self.parser.framesetOK = False - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inBody"] + def startTagOther(self, token): + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(token) - def startTagFrameset(self, token): - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inFrameset"] + def endTagP(self, token): + if not self.tree.elementInScope("p", variant="button"): + self.startTagCloseP(impliedTagToken("p", "StartTag")) + self.parser.parseError("unexpected-end-tag", {"name": "p"}) + self.endTagP(impliedTagToken("p", "EndTag")) + else: + self.tree.generateImpliedEndTags("p") + if self.tree.openElements[-1].name != "p": + self.parser.parseError("unexpected-end-tag", {"name": "p"}) + node = self.tree.openElements.pop() + while node.name != "p": + node = self.tree.openElements.pop() - def startTagFromHead(self, token): - self.parser.parseError("unexpected-start-tag-out-of-my-head", - {"name": token["name"]}) - self.tree.openElements.append(self.tree.headPointer) - self.parser.phases["inHead"].processStartTag(token) - for node in self.tree.openElements[::-1]: - if node.name == "head": - self.tree.openElements.remove(node) + def endTagBody(self, token): + if not self.tree.elementInScope("body"): + self.parser.parseError() + return + elif self.tree.openElements[-1].name != "body": + for node in self.tree.openElements[2:]: + if node.name not in frozenset(("dd", "dt", "li", "optgroup", + "option", "p", "rp", "rt", + "tbody", "td", "tfoot", + "th", "thead", "tr", "body", + "html")): + # Not sure this is the correct name for the parse error + self.parser.parseError( + "expected-one-end-tag-but-got-another", + {"gotName": "body", "expectedName": node.name}) break + self.parser.phase = self.parser.phases["afterBody"] - def startTagHead(self, token): - self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) - - def startTagOther(self, token): - self.anythingElse() + def endTagHtml(self, token): + # We repeat the test for the body end tag token being ignored here + if self.tree.elementInScope("body"): + self.endTagBody(impliedTagToken("body")) return token - def endTagHtmlBodyBr(self, token): - self.anythingElse() - return token + def endTagBlock(self, token): + # Put us back in the right whitespace handling mode + if token["name"] == "pre": + self.processSpaceCharacters = self.processSpaceCharactersNonPre + inScope = self.tree.elementInScope(token["name"]) + if inScope: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) + if inScope: + node = self.tree.openElements.pop() + while node.name != token["name"]: + node = self.tree.openElements.pop() - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + def endTagForm(self, token): + node = self.tree.formPointer + self.tree.formPointer = None + if node is None or not self.tree.elementInScope(node): + self.parser.parseError("unexpected-end-tag", + {"name": "form"}) + else: + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1] != node: + self.parser.parseError("end-tag-too-early-ignored", + {"name": "form"}) + self.tree.openElements.remove(node) - def anythingElse(self): - self.tree.insertElement(impliedTagToken("body", "StartTag")) - self.parser.phase = self.parser.phases["inBody"] - self.parser.framesetOK = True - - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml), - ("body", startTagBody), - ("frameset", startTagFrameset), - (("base", "basefont", "bgsound", "link", "meta", "noframes", "script", - "style", "title"), - startTagFromHead), - ("head", startTagHead) - ]) - startTagHandler.default = startTagOther - endTagHandler = _utils.MethodDispatcher([(("body", "html", "br"), - endTagHtmlBodyBr)]) - endTagHandler.default = endTagOther - - class InBodyPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#parsing-main-inbody - # the really-really-really-very crazy mode - __slots__ = ("processSpaceCharacters",) - - def __init__(self, *args, **kwargs): - super(InBodyPhase, self).__init__(*args, **kwargs) - # Set this to the default handler - self.processSpaceCharacters = self.processSpaceCharactersNonPre + def endTagListItem(self, token): + if token["name"] == "li": + variant = "list" + else: + variant = None + if not self.tree.elementInScope(token["name"], variant=variant): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + else: + self.tree.generateImpliedEndTags(exclude=token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError( + "end-tag-too-early", + {"name": token["name"]}) + node = self.tree.openElements.pop() + while node.name != token["name"]: + node = self.tree.openElements.pop() - def isMatchingFormattingElement(self, node1, node2): - return (node1.name == node2.name and - node1.namespace == node2.namespace and - node1.attributes == node2.attributes) + def endTagHeading(self, token): + for item in headingElements: + if self.tree.elementInScope(item): + self.tree.generateImpliedEndTags() + break + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - # helper - def addFormattingElement(self, token): - self.tree.insertElement(token) - element = self.tree.openElements[-1] + for item in headingElements: + if self.tree.elementInScope(item): + item = self.tree.openElements.pop() + while item.name not in headingElements: + item = self.tree.openElements.pop() + break - matchingElements = [] - for node in self.tree.activeFormattingElements[::-1]: - if node is Marker: - break - elif self.isMatchingFormattingElement(node, element): - matchingElements.append(node) - - assert len(matchingElements) <= 3 - if len(matchingElements) == 3: - self.tree.activeFormattingElements.remove(matchingElements[-1]) - self.tree.activeFormattingElements.append(element) - - # the real deal - def processEOF(self): - allowed_elements = frozenset(("dd", "dt", "li", "p", "tbody", "td", - "tfoot", "th", "thead", "tr", "body", - "html")) - for node in self.tree.openElements[::-1]: - if node.name not in allowed_elements: - self.parser.parseError("expected-closing-tag-but-got-eof") - break - # Stop parsing + def endTagFormatting(self, token): + """The much-feared adoption agency algorithm""" + # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867 + # XXX Better parseError messages appreciated. + + # Step 1 + outerLoopCounter = 0 + + # Step 2 + while outerLoopCounter < 8: + + # Step 3 + outerLoopCounter += 1 + + # Step 4: + + # Let the formatting element be the last element in + # the list of active formatting elements that: + # - is between the end of the list and the last scope + # marker in the list, if any, or the start of the list + # otherwise, and + # - has the same tag name as the token. + formattingElement = self.tree.elementInActiveFormattingElements( + token["name"]) + if (not formattingElement or + (formattingElement in self.tree.openElements and + not self.tree.elementInScope(formattingElement.name))): + # If there is no such node, then abort these steps + # and instead act as described in the "any other + # end tag" entry below. + self.endTagOther(token) + return - def processSpaceCharactersDropNewline(self, token): - # Sometimes (start of <pre>, <listing>, and <textarea> blocks) we - # want to drop leading newlines - data = token["data"] - self.processSpaceCharacters = self.processSpaceCharactersNonPre - if (data.startswith("\n") and - self.tree.openElements[-1].name in ("pre", "listing", "textarea") and - not self.tree.openElements[-1].hasContent()): - data = data[1:] - if data: - self.tree.reconstructActiveFormattingElements() - self.tree.insertText(data) - - def processCharacters(self, token): - if token["data"] == "\u0000": - # The tokenizer should always emit null on its own + # Otherwise, if there is such a node, but that node is + # not in the stack of open elements, then this is a + # parse error; remove the element from the list, and + # abort these steps. + elif formattingElement not in self.tree.openElements: + self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) + self.tree.activeFormattingElements.remove(formattingElement) return - self.tree.reconstructActiveFormattingElements() - self.tree.insertText(token["data"]) - # This must be bad for performance - if (self.parser.framesetOK and - any(char not in spaceCharacters - for char in token["data"])): - self.parser.framesetOK = False - - def processSpaceCharactersNonPre(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertText(token["data"]) - def startTagProcessInHead(self, token): - return self.parser.phases["inHead"].processStartTag(token) + # Otherwise, if there is such a node, and that node is + # also in the stack of open elements, but the element + # is not in scope, then this is a parse error; ignore + # the token, and abort these steps. + elif not self.tree.elementInScope(formattingElement.name): + self.parser.parseError("adoption-agency-4.4", {"name": token["name"]}) + return - def startTagBody(self, token): - self.parser.parseError("unexpected-start-tag", {"name": "body"}) - if (len(self.tree.openElements) == 1 or - self.tree.openElements[1].name != "body"): - assert self.parser.innerHTML - else: - self.parser.framesetOK = False - for attr, value in token["data"].items(): - if attr not in self.tree.openElements[1].attributes: - self.tree.openElements[1].attributes[attr] = value - - def startTagFrameset(self, token): - self.parser.parseError("unexpected-start-tag", {"name": "frameset"}) - if (len(self.tree.openElements) == 1 or self.tree.openElements[1].name != "body"): - assert self.parser.innerHTML - elif not self.parser.framesetOK: - pass + # Otherwise, there is a formatting element and that + # element is in the stack and is in scope. If the + # element is not the current node, this is a parse + # error. In any case, proceed with the algorithm as + # written in the following steps. else: - if self.tree.openElements[1].parent: - self.tree.openElements[1].parent.removeChild(self.tree.openElements[1]) - while self.tree.openElements[-1].name != "html": - self.tree.openElements.pop() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inFrameset"] - - def startTagCloseP(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) + if formattingElement != self.tree.openElements[-1]: + self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) + + # Step 5: + + # Let the furthest block be the topmost node in the + # stack of open elements that is lower in the stack + # than the formatting element, and is an element in + # the special category. There might not be one. + afeIndex = self.tree.openElements.index(formattingElement) + furthestBlock = None + for element in self.tree.openElements[afeIndex:]: + if element.nameTuple in specialElements: + furthestBlock = element + break - def startTagPreListing(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) - self.parser.framesetOK = False - self.processSpaceCharacters = self.processSpaceCharactersDropNewline + # Step 6: + + # If there is no furthest block, then the UA must + # first pop all the nodes from the bottom of the stack + # of open elements, from the current node up to and + # including the formatting element, then remove the + # formatting element from the list of active + # formatting elements, and finally abort these steps. + if furthestBlock is None: + element = self.tree.openElements.pop() + while element != formattingElement: + element = self.tree.openElements.pop() + self.tree.activeFormattingElements.remove(element) + return - def startTagForm(self, token): - if self.tree.formPointer: - self.parser.parseError("unexpected-start-tag", {"name": "form"}) + # Step 7 + commonAncestor = self.tree.openElements[afeIndex - 1] + + # Step 8: + # The bookmark is supposed to help us identify where to reinsert + # nodes in step 15. We have to ensure that we reinsert nodes after + # the node before the active formatting element. Note the bookmark + # can move in step 9.7 + bookmark = self.tree.activeFormattingElements.index(formattingElement) + + # Step 9 + lastNode = node = furthestBlock + innerLoopCounter = 0 + + index = self.tree.openElements.index(node) + while innerLoopCounter < 3: + innerLoopCounter += 1 + # Node is element before node in open elements + index -= 1 + node = self.tree.openElements[index] + if node not in self.tree.activeFormattingElements: + self.tree.openElements.remove(node) + continue + # Step 9.6 + if node == formattingElement: + break + # Step 9.7 + if lastNode == furthestBlock: + bookmark = self.tree.activeFormattingElements.index(node) + 1 + # Step 9.8 + clone = node.cloneNode() + # Replace node with clone + self.tree.activeFormattingElements[ + self.tree.activeFormattingElements.index(node)] = clone + self.tree.openElements[ + self.tree.openElements.index(node)] = clone + node = clone + # Step 9.9 + # Remove lastNode from its parents, if any + if lastNode.parent: + lastNode.parent.removeChild(lastNode) + node.appendChild(lastNode) + # Step 9.10 + lastNode = node + + # Step 10 + # Foster parent lastNode if commonAncestor is a + # table, tbody, tfoot, thead, or tr we need to foster + # parent the lastNode + if lastNode.parent: + lastNode.parent.removeChild(lastNode) + + if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")): + parent, insertBefore = self.tree.getTableMisnestedNodePosition() + parent.insertBefore(lastNode, insertBefore) else: - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) - self.tree.formPointer = self.tree.openElements[-1] + commonAncestor.appendChild(lastNode) - def startTagListItem(self, token): - self.parser.framesetOK = False + # Step 11 + clone = formattingElement.cloneNode() - stopNamesMap = {"li": ["li"], - "dt": ["dt", "dd"], - "dd": ["dt", "dd"]} - stopNames = stopNamesMap[token["name"]] - for node in reversed(self.tree.openElements): - if node.name in stopNames: - self.parser.phase.processEndTag( - impliedTagToken(node.name, "EndTag")) - break - if (node.nameTuple in specialElements and - node.name not in ("address", "div", "p")): - break + # Step 12 + furthestBlock.reparentChildren(clone) - if self.tree.elementInScope("p", variant="button"): - self.parser.phase.processEndTag( - impliedTagToken("p", "EndTag")) + # Step 13 + furthestBlock.appendChild(clone) - self.tree.insertElement(token) + # Step 14 + self.tree.activeFormattingElements.remove(formattingElement) + self.tree.activeFormattingElements.insert(bookmark, clone) - def startTagPlaintext(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.insertElement(token) - self.parser.tokenizer.state = self.parser.tokenizer.plaintextState + # Step 15 + self.tree.openElements.remove(formattingElement) + self.tree.openElements.insert( + self.tree.openElements.index(furthestBlock) + 1, clone) - def startTagHeading(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - if self.tree.openElements[-1].name in headingElements: - self.parser.parseError("unexpected-start-tag", {"name": token["name"]}) - self.tree.openElements.pop() - self.tree.insertElement(token) + def endTagAppletMarqueeObject(self, token): + if self.tree.elementInScope(token["name"]): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - def startTagA(self, token): - afeAElement = self.tree.elementInActiveFormattingElements("a") - if afeAElement: - self.parser.parseError("unexpected-start-tag-implies-end-tag", - {"startName": "a", "endName": "a"}) - self.endTagFormatting(impliedTagToken("a")) - if afeAElement in self.tree.openElements: - self.tree.openElements.remove(afeAElement) - if afeAElement in self.tree.activeFormattingElements: - self.tree.activeFormattingElements.remove(afeAElement) - self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(token) + if self.tree.elementInScope(token["name"]): + element = self.tree.openElements.pop() + while element.name != token["name"]: + element = self.tree.openElements.pop() + self.tree.clearActiveFormattingElements() - def startTagFormatting(self, token): - self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(token) + def endTagBr(self, token): + self.parser.parseError("unexpected-end-tag-treated-as", + {"originalName": "br", "newName": "br element"}) + self.tree.reconstructActiveFormattingElements() + self.tree.insertElement(impliedTagToken("br", "StartTag")) + self.tree.openElements.pop() - def startTagNobr(self, token): - self.tree.reconstructActiveFormattingElements() - if self.tree.elementInScope("nobr"): - self.parser.parseError("unexpected-start-tag-implies-end-tag", - {"startName": "nobr", "endName": "nobr"}) - self.processEndTag(impliedTagToken("nobr")) - # XXX Need tests that trigger the following - self.tree.reconstructActiveFormattingElements() - self.addFormattingElement(token) - - def startTagButton(self, token): - if self.tree.elementInScope("button"): - self.parser.parseError("unexpected-start-tag-implies-end-tag", - {"startName": "button", "endName": "button"}) - self.processEndTag(impliedTagToken("button")) - return token + def endTagOther(self, token): + for node in self.tree.openElements[::-1]: + if node.name == token["name"]: + self.tree.generateImpliedEndTags(exclude=token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + while self.tree.openElements.pop() != node: + pass + break else: - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - self.parser.framesetOK = False + if node.nameTuple in specialElements: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + break - def startTagAppletMarqueeObject(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - self.tree.activeFormattingElements.append(Marker) - self.parser.framesetOK = False + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + (("base", "basefont", "bgsound", "command", "link", "meta", + "script", "style", "title"), + startTagProcessInHead), + ("body", startTagBody), + ("frameset", startTagFrameset), + (("address", "article", "aside", "blockquote", "center", "details", + "dir", "div", "dl", "fieldset", "figcaption", "figure", + "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", + "section", "summary", "ul"), + startTagCloseP), + (headingElements, startTagHeading), + (("pre", "listing"), startTagPreListing), + ("form", startTagForm), + (("li", "dd", "dt"), startTagListItem), + ("plaintext", startTagPlaintext), + ("a", startTagA), + (("b", "big", "code", "em", "font", "i", "s", "small", "strike", + "strong", "tt", "u"), startTagFormatting), + ("nobr", startTagNobr), + ("button", startTagButton), + (("applet", "marquee", "object"), startTagAppletMarqueeObject), + ("xmp", startTagXmp), + ("table", startTagTable), + (("area", "br", "embed", "img", "keygen", "wbr"), + startTagVoidFormatting), + (("param", "source", "track"), startTagParamSource), + ("input", startTagInput), + ("hr", startTagHr), + ("image", startTagImage), + ("isindex", startTagIsIndex), + ("textarea", startTagTextarea), + ("iframe", startTagIFrame), + ("noscript", startTagNoscript), + (("noembed", "noframes"), startTagRawtext), + ("select", startTagSelect), + (("rp", "rt"), startTagRpRt), + (("option", "optgroup"), startTagOpt), + (("math"), startTagMath), + (("svg"), startTagSvg), + (("caption", "col", "colgroup", "frame", "head", + "tbody", "td", "tfoot", "th", "thead", + "tr"), startTagMisplaced) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("body", endTagBody), + ("html", endTagHtml), + (("address", "article", "aside", "blockquote", "button", "center", + "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", + "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", + "section", "summary", "ul"), endTagBlock), + ("form", endTagForm), + ("p", endTagP), + (("dd", "dt", "li"), endTagListItem), + (headingElements, endTagHeading), + (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", + "strike", "strong", "tt", "u"), endTagFormatting), + (("applet", "marquee", "object"), endTagAppletMarqueeObject), + ("br", endTagBr), + ]) + endTagHandler.default = endTagOther + + +class TextPhase(Phase): + __slots__ = tuple() + + def processCharacters(self, token): + self.tree.insertText(token["data"]) + + def processEOF(self): + self.parser.parseError("expected-named-closing-tag-but-got-eof", + {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + return True + + def startTagOther(self, token): + assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name'] + + def endTagScript(self, token): + node = self.tree.openElements.pop() + assert node.name == "script" + self.parser.phase = self.parser.originalPhase + # The rest of this method is all stuff that only happens if + # document.write works + + def endTagOther(self, token): + self.tree.openElements.pop() + self.parser.phase = self.parser.originalPhase + + startTagHandler = _utils.MethodDispatcher([]) + startTagHandler.default = startTagOther + endTagHandler = _utils.MethodDispatcher([ + ("script", endTagScript)]) + endTagHandler.default = endTagOther + + +class InTablePhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-table + __slots__ = tuple() + + # helper methods + def clearStackToTableContext(self): + # "clear the stack back to a table context" + while self.tree.openElements[-1].name not in ("table", "html"): + # self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + # When the current node is <html> it's an innerHTML case - def startTagXmp(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) - self.tree.reconstructActiveFormattingElements() - self.parser.framesetOK = False - self.parser.parseRCDataRawtext(token, "RAWTEXT") + # processing methods + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-table") + else: + assert self.parser.innerHTML + # Stop parsing + + def processSpaceCharacters(self, token): + originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["inTableText"] + self.parser.phase.originalPhase = originalPhase + self.parser.phase.processSpaceCharacters(token) + + def processCharacters(self, token): + originalPhase = self.parser.phase + self.parser.phase = self.parser.phases["inTableText"] + self.parser.phase.originalPhase = originalPhase + self.parser.phase.processCharacters(token) + + def insertText(self, token): + # If we get here there must be at least one non-whitespace character + # Do the table magic! + self.tree.insertFromTable = True + self.parser.phases["inBody"].processCharacters(token) + self.tree.insertFromTable = False + + def startTagCaption(self, token): + self.clearStackToTableContext() + self.tree.activeFormattingElements.append(Marker) + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inCaption"] - def startTagTable(self, token): - if self.parser.compatMode != "quirks": - if self.tree.elementInScope("p", variant="button"): - self.processEndTag(impliedTagToken("p")) - self.tree.insertElement(token) - self.parser.framesetOK = False - self.parser.phase = self.parser.phases["inTable"] + def startTagColgroup(self, token): + self.clearStackToTableContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inColumnGroup"] - def startTagVoidFormatting(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - self.parser.framesetOK = False + def startTagCol(self, token): + self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) + return token - def startTagInput(self, token): - framesetOK = self.parser.framesetOK - self.startTagVoidFormatting(token) - if ("type" in token["data"] and - token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): - # input type=hidden doesn't change framesetOK - self.parser.framesetOK = framesetOK + def startTagRowGroup(self, token): + self.clearStackToTableContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inTableBody"] - def startTagParamSource(self, token): - self.tree.insertElement(token) - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True + def startTagImplyTbody(self, token): + self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) + return token - def startTagHr(self, token): - if self.tree.elementInScope("p", variant="button"): - self.endTagP(impliedTagToken("p")) + def startTagTable(self, token): + self.parser.parseError("unexpected-start-tag-implies-end-tag", + {"startName": "table", "endName": "table"}) + self.parser.phase.processEndTag(impliedTagToken("table")) + if not self.parser.innerHTML: + return token + + def startTagStyleScript(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagInput(self, token): + if ("type" in token["data"] and + token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): + self.parser.parseError("unexpected-hidden-input-in-table") self.tree.insertElement(token) + # XXX associate with form self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - self.parser.framesetOK = False + else: + self.startTagOther(token) - def startTagImage(self, token): - # No really... - self.parser.parseError("unexpected-start-tag-treated-as", - {"originalName": "image", "newName": "img"}) - self.processStartTag(impliedTagToken("img", "StartTag", - attributes=token["data"], - selfClosing=token["selfClosing"])) - - def startTagIsIndex(self, token): - self.parser.parseError("deprecated-tag", {"name": "isindex"}) - if self.tree.formPointer: - return - form_attrs = {} - if "action" in token["data"]: - form_attrs["action"] = token["data"]["action"] - self.processStartTag(impliedTagToken("form", "StartTag", - attributes=form_attrs)) - self.processStartTag(impliedTagToken("hr", "StartTag")) - self.processStartTag(impliedTagToken("label", "StartTag")) - # XXX Localization ... - if "prompt" in token["data"]: - prompt = token["data"]["prompt"] - else: - prompt = "This is a searchable index. Enter search keywords: " - self.processCharacters( - {"type": tokenTypes["Characters"], "data": prompt}) - attributes = token["data"].copy() - if "action" in attributes: - del attributes["action"] - if "prompt" in attributes: - del attributes["prompt"] - attributes["name"] = "isindex" - self.processStartTag(impliedTagToken("input", "StartTag", - attributes=attributes, - selfClosing=token["selfClosing"])) - self.processEndTag(impliedTagToken("label")) - self.processStartTag(impliedTagToken("hr", "StartTag")) - self.processEndTag(impliedTagToken("form")) - - def startTagTextarea(self, token): + def startTagForm(self, token): + self.parser.parseError("unexpected-form-in-table") + if self.tree.formPointer is None: self.tree.insertElement(token) - self.parser.tokenizer.state = self.parser.tokenizer.rcdataState - self.processSpaceCharacters = self.processSpaceCharactersDropNewline - self.parser.framesetOK = False + self.tree.formPointer = self.tree.openElements[-1] + self.tree.openElements.pop() - def startTagIFrame(self, token): - self.parser.framesetOK = False - self.startTagRawtext(token) + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) + # Do the table magic! + self.tree.insertFromTable = True + self.parser.phases["inBody"].processStartTag(token) + self.tree.insertFromTable = False + + def endTagTable(self, token): + if self.tree.elementInScope("table", variant="table"): + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "table": + self.parser.parseError("end-tag-too-early-named", + {"gotName": "table", + "expectedName": self.tree.openElements[-1].name}) + while self.tree.openElements[-1].name != "table": + self.tree.openElements.pop() + self.tree.openElements.pop() + self.parser.resetInsertionMode() + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() - def startTagNoscript(self, token): - if self.parser.scripting: - self.startTagRawtext(token) - else: - self.startTagOther(token) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) + # Do the table magic! + self.tree.insertFromTable = True + self.parser.phases["inBody"].processEndTag(token) + self.tree.insertFromTable = False + + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("caption", startTagCaption), + ("colgroup", startTagColgroup), + ("col", startTagCol), + (("tbody", "tfoot", "thead"), startTagRowGroup), + (("td", "th", "tr"), startTagImplyTbody), + ("table", startTagTable), + (("style", "script"), startTagStyleScript), + ("input", startTagInput), + ("form", startTagForm) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("table", endTagTable), + (("body", "caption", "col", "colgroup", "html", "tbody", "td", + "tfoot", "th", "thead", "tr"), endTagIgnore) + ]) + endTagHandler.default = endTagOther + + +class InTableTextPhase(Phase): + __slots__ = ("originalPhase", "characterTokens") + + def __init__(self, *args, **kwargs): + super(InTableTextPhase, self).__init__(*args, **kwargs) + self.originalPhase = None + self.characterTokens = [] + + def flushCharacters(self): + data = "".join([item["data"] for item in self.characterTokens]) + if any(item not in spaceCharacters for item in data): + token = {"type": tokenTypes["Characters"], "data": data} + self.parser.phases["inTable"].insertText(token) + elif data: + self.tree.insertText(data) + self.characterTokens = [] + + def processComment(self, token): + self.flushCharacters() + self.parser.phase = self.originalPhase + return token + + def processEOF(self): + self.flushCharacters() + self.parser.phase = self.originalPhase + return True + + def processCharacters(self, token): + if token["data"] == "\u0000": + return + self.characterTokens.append(token) + + def processSpaceCharacters(self, token): + # pretty sure we should never reach here + self.characterTokens.append(token) +# assert False + + def processStartTag(self, token): + self.flushCharacters() + self.parser.phase = self.originalPhase + return token + + def processEndTag(self, token): + self.flushCharacters() + self.parser.phase = self.originalPhase + return token + + +class InCaptionPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-caption + __slots__ = tuple() + + def ignoreEndTagCaption(self): + return not self.tree.elementInScope("caption", variant="table") + + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + return self.parser.phases["inBody"].processCharacters(token) + + def startTagTableElement(self, token): + self.parser.parseError() + # XXX Have to duplicate logic here to find out if the tag is ignored + ignoreEndTag = self.ignoreEndTagCaption() + self.parser.phase.processEndTag(impliedTagToken("caption")) + if not ignoreEndTag: + return token - def startTagRawtext(self, token): - """iframe, noembed noframes, noscript(if scripting enabled)""" - self.parser.parseRCDataRawtext(token, "RAWTEXT") - - def startTagOpt(self, token): - if self.tree.openElements[-1].name == "option": - self.parser.phase.processEndTag(impliedTagToken("option")) - self.tree.reconstructActiveFormattingElements() - self.parser.tree.insertElement(token) - - def startTagSelect(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - self.parser.framesetOK = False - if self.parser.phase in (self.parser.phases["inTable"], - self.parser.phases["inCaption"], - self.parser.phases["inColumnGroup"], - self.parser.phases["inTableBody"], - self.parser.phases["inRow"], - self.parser.phases["inCell"]): - self.parser.phase = self.parser.phases["inSelectInTable"] - else: - self.parser.phase = self.parser.phases["inSelect"] - - def startTagRpRt(self, token): - if self.tree.elementInScope("ruby"): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != "ruby": - self.parser.parseError() - self.tree.insertElement(token) - - def startTagMath(self, token): - self.tree.reconstructActiveFormattingElements() - self.parser.adjustMathMLAttributes(token) - self.parser.adjustForeignAttributes(token) - token["namespace"] = namespaces["mathml"] - self.tree.insertElement(token) - # Need to get the parse error right for the case where the token - # has a namespace not equal to the xmlns attribute - if token["selfClosing"]: + def startTagOther(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def endTagCaption(self, token): + if not self.ignoreEndTagCaption(): + # AT this code is quite similar to endTagTable in "InTable" + self.tree.generateImpliedEndTags() + if self.tree.openElements[-1].name != "caption": + self.parser.parseError("expected-one-end-tag-but-got-another", + {"gotName": "caption", + "expectedName": self.tree.openElements[-1].name}) + while self.tree.openElements[-1].name != "caption": self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def startTagSvg(self, token): - self.tree.reconstructActiveFormattingElements() - self.parser.adjustSVGAttributes(token) - self.parser.adjustForeignAttributes(token) - token["namespace"] = namespaces["svg"] - self.tree.insertElement(token) - # Need to get the parse error right for the case where the token - # has a namespace not equal to the xmlns attribute - if token["selfClosing"]: - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def startTagMisplaced(self, token): - """ Elements that should be children of other elements that have a - different insertion mode; here they are ignored - "caption", "col", "colgroup", "frame", "frameset", "head", - "option", "optgroup", "tbody", "td", "tfoot", "th", "thead", - "tr", "noscript" - """ - self.parser.parseError("unexpected-start-tag-ignored", {"name": token["name"]}) - - def startTagOther(self, token): - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(token) - - def endTagP(self, token): - if not self.tree.elementInScope("p", variant="button"): - self.startTagCloseP(impliedTagToken("p", "StartTag")) - self.parser.parseError("unexpected-end-tag", {"name": "p"}) - self.endTagP(impliedTagToken("p", "EndTag")) - else: - self.tree.generateImpliedEndTags("p") - if self.tree.openElements[-1].name != "p": - self.parser.parseError("unexpected-end-tag", {"name": "p"}) - node = self.tree.openElements.pop() - while node.name != "p": - node = self.tree.openElements.pop() - - def endTagBody(self, token): - if not self.tree.elementInScope("body"): - self.parser.parseError() - return - elif self.tree.openElements[-1].name != "body": - for node in self.tree.openElements[2:]: - if node.name not in frozenset(("dd", "dt", "li", "optgroup", - "option", "p", "rp", "rt", - "tbody", "td", "tfoot", - "th", "thead", "tr", "body", - "html")): - # Not sure this is the correct name for the parse error - self.parser.parseError( - "expected-one-end-tag-but-got-another", - {"gotName": "body", "expectedName": node.name}) - break - self.parser.phase = self.parser.phases["afterBody"] - - def endTagHtml(self, token): - # We repeat the test for the body end tag token being ignored here - if self.tree.elementInScope("body"): - self.endTagBody(impliedTagToken("body")) - return token - - def endTagBlock(self, token): - # Put us back in the right whitespace handling mode - if token["name"] == "pre": - self.processSpaceCharacters = self.processSpaceCharactersNonPre - inScope = self.tree.elementInScope(token["name"]) - if inScope: - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - if inScope: - node = self.tree.openElements.pop() - while node.name != token["name"]: - node = self.tree.openElements.pop() - - def endTagForm(self, token): - node = self.tree.formPointer - self.tree.formPointer = None - if node is None or not self.tree.elementInScope(node): - self.parser.parseError("unexpected-end-tag", - {"name": "form"}) - else: - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1] != node: - self.parser.parseError("end-tag-too-early-ignored", - {"name": "form"}) - self.tree.openElements.remove(node) - - def endTagListItem(self, token): - if token["name"] == "li": - variant = "list" - else: - variant = None - if not self.tree.elementInScope(token["name"], variant=variant): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - else: - self.tree.generateImpliedEndTags(exclude=token["name"]) - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError( - "end-tag-too-early", - {"name": token["name"]}) - node = self.tree.openElements.pop() - while node.name != token["name"]: - node = self.tree.openElements.pop() - - def endTagHeading(self, token): - for item in headingElements: - if self.tree.elementInScope(item): - self.tree.generateImpliedEndTags() - break - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - - for item in headingElements: - if self.tree.elementInScope(item): - item = self.tree.openElements.pop() - while item.name not in headingElements: - item = self.tree.openElements.pop() - break - - def endTagFormatting(self, token): - """The much-feared adoption agency algorithm""" - # http://svn.whatwg.org/webapps/complete.html#adoptionAgency revision 7867 - # XXX Better parseError messages appreciated. - - # Step 1 - outerLoopCounter = 0 - - # Step 2 - while outerLoopCounter < 8: - - # Step 3 - outerLoopCounter += 1 - - # Step 4: - - # Let the formatting element be the last element in - # the list of active formatting elements that: - # - is between the end of the list and the last scope - # marker in the list, if any, or the start of the list - # otherwise, and - # - has the same tag name as the token. - formattingElement = self.tree.elementInActiveFormattingElements( - token["name"]) - if (not formattingElement or - (formattingElement in self.tree.openElements and - not self.tree.elementInScope(formattingElement.name))): - # If there is no such node, then abort these steps - # and instead act as described in the "any other - # end tag" entry below. - self.endTagOther(token) - return - - # Otherwise, if there is such a node, but that node is - # not in the stack of open elements, then this is a - # parse error; remove the element from the list, and - # abort these steps. - elif formattingElement not in self.tree.openElements: - self.parser.parseError("adoption-agency-1.2", {"name": token["name"]}) - self.tree.activeFormattingElements.remove(formattingElement) - return - - # Otherwise, if there is such a node, and that node is - # also in the stack of open elements, but the element - # is not in scope, then this is a parse error; ignore - # the token, and abort these steps. - elif not self.tree.elementInScope(formattingElement.name): - self.parser.parseError("adoption-agency-4.4", {"name": token["name"]}) - return - - # Otherwise, there is a formatting element and that - # element is in the stack and is in scope. If the - # element is not the current node, this is a parse - # error. In any case, proceed with the algorithm as - # written in the following steps. - else: - if formattingElement != self.tree.openElements[-1]: - self.parser.parseError("adoption-agency-1.3", {"name": token["name"]}) - - # Step 5: - - # Let the furthest block be the topmost node in the - # stack of open elements that is lower in the stack - # than the formatting element, and is an element in - # the special category. There might not be one. - afeIndex = self.tree.openElements.index(formattingElement) - furthestBlock = None - for element in self.tree.openElements[afeIndex:]: - if element.nameTuple in specialElements: - furthestBlock = element - break - - # Step 6: - - # If there is no furthest block, then the UA must - # first pop all the nodes from the bottom of the stack - # of open elements, from the current node up to and - # including the formatting element, then remove the - # formatting element from the list of active - # formatting elements, and finally abort these steps. - if furthestBlock is None: - element = self.tree.openElements.pop() - while element != formattingElement: - element = self.tree.openElements.pop() - self.tree.activeFormattingElements.remove(element) - return - - # Step 7 - commonAncestor = self.tree.openElements[afeIndex - 1] - - # Step 8: - # The bookmark is supposed to help us identify where to reinsert - # nodes in step 15. We have to ensure that we reinsert nodes after - # the node before the active formatting element. Note the bookmark - # can move in step 9.7 - bookmark = self.tree.activeFormattingElements.index(formattingElement) - - # Step 9 - lastNode = node = furthestBlock - innerLoopCounter = 0 - - index = self.tree.openElements.index(node) - while innerLoopCounter < 3: - innerLoopCounter += 1 - # Node is element before node in open elements - index -= 1 - node = self.tree.openElements[index] - if node not in self.tree.activeFormattingElements: - self.tree.openElements.remove(node) - continue - # Step 9.6 - if node == formattingElement: - break - # Step 9.7 - if lastNode == furthestBlock: - bookmark = self.tree.activeFormattingElements.index(node) + 1 - # Step 9.8 - clone = node.cloneNode() - # Replace node with clone - self.tree.activeFormattingElements[ - self.tree.activeFormattingElements.index(node)] = clone - self.tree.openElements[ - self.tree.openElements.index(node)] = clone - node = clone - # Step 9.9 - # Remove lastNode from its parents, if any - if lastNode.parent: - lastNode.parent.removeChild(lastNode) - node.appendChild(lastNode) - # Step 9.10 - lastNode = node - - # Step 10 - # Foster parent lastNode if commonAncestor is a - # table, tbody, tfoot, thead, or tr we need to foster - # parent the lastNode - if lastNode.parent: - lastNode.parent.removeChild(lastNode) - - if commonAncestor.name in frozenset(("table", "tbody", "tfoot", "thead", "tr")): - parent, insertBefore = self.tree.getTableMisnestedNodePosition() - parent.insertBefore(lastNode, insertBefore) - else: - commonAncestor.appendChild(lastNode) - - # Step 11 - clone = formattingElement.cloneNode() - - # Step 12 - furthestBlock.reparentChildren(clone) - - # Step 13 - furthestBlock.appendChild(clone) - - # Step 14 - self.tree.activeFormattingElements.remove(formattingElement) - self.tree.activeFormattingElements.insert(bookmark, clone) - - # Step 15 - self.tree.openElements.remove(formattingElement) - self.tree.openElements.insert( - self.tree.openElements.index(furthestBlock) + 1, clone) - - def endTagAppletMarqueeObject(self, token): - if self.tree.elementInScope(token["name"]): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("end-tag-too-early", {"name": token["name"]}) - - if self.tree.elementInScope(token["name"]): - element = self.tree.openElements.pop() - while element.name != token["name"]: - element = self.tree.openElements.pop() - self.tree.clearActiveFormattingElements() - - def endTagBr(self, token): - self.parser.parseError("unexpected-end-tag-treated-as", - {"originalName": "br", "newName": "br element"}) - self.tree.reconstructActiveFormattingElements() - self.tree.insertElement(impliedTagToken("br", "StartTag")) self.tree.openElements.pop() + self.tree.clearActiveFormattingElements() + self.parser.phase = self.parser.phases["inTable"] + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() - def endTagOther(self, token): - for node in self.tree.openElements[::-1]: - if node.name == token["name"]: - self.tree.generateImpliedEndTags(exclude=token["name"]) - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - while self.tree.openElements.pop() != node: - pass - break - else: - if node.nameTuple in specialElements: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - break + def endTagTable(self, token): + self.parser.parseError() + ignoreEndTag = self.ignoreEndTagCaption() + self.parser.phase.processEndTag(impliedTagToken("caption")) + if not ignoreEndTag: + return token - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - (("base", "basefont", "bgsound", "command", "link", "meta", - "script", "style", "title"), - startTagProcessInHead), - ("body", startTagBody), - ("frameset", startTagFrameset), - (("address", "article", "aside", "blockquote", "center", "details", - "dir", "div", "dl", "fieldset", "figcaption", "figure", - "footer", "header", "hgroup", "main", "menu", "nav", "ol", "p", - "section", "summary", "ul"), - startTagCloseP), - (headingElements, startTagHeading), - (("pre", "listing"), startTagPreListing), - ("form", startTagForm), - (("li", "dd", "dt"), startTagListItem), - ("plaintext", startTagPlaintext), - ("a", startTagA), - (("b", "big", "code", "em", "font", "i", "s", "small", "strike", - "strong", "tt", "u"), startTagFormatting), - ("nobr", startTagNobr), - ("button", startTagButton), - (("applet", "marquee", "object"), startTagAppletMarqueeObject), - ("xmp", startTagXmp), - ("table", startTagTable), - (("area", "br", "embed", "img", "keygen", "wbr"), - startTagVoidFormatting), - (("param", "source", "track"), startTagParamSource), - ("input", startTagInput), - ("hr", startTagHr), - ("image", startTagImage), - ("isindex", startTagIsIndex), - ("textarea", startTagTextarea), - ("iframe", startTagIFrame), - ("noscript", startTagNoscript), - (("noembed", "noframes"), startTagRawtext), - ("select", startTagSelect), - (("rp", "rt"), startTagRpRt), - (("option", "optgroup"), startTagOpt), - (("math"), startTagMath), - (("svg"), startTagSvg), - (("caption", "col", "colgroup", "frame", "head", - "tbody", "td", "tfoot", "th", "thead", - "tr"), startTagMisplaced) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - ("body", endTagBody), - ("html", endTagHtml), - (("address", "article", "aside", "blockquote", "button", "center", - "details", "dialog", "dir", "div", "dl", "fieldset", "figcaption", "figure", - "footer", "header", "hgroup", "listing", "main", "menu", "nav", "ol", "pre", - "section", "summary", "ul"), endTagBlock), - ("form", endTagForm), - ("p", endTagP), - (("dd", "dt", "li"), endTagListItem), - (headingElements, endTagHeading), - (("a", "b", "big", "code", "em", "font", "i", "nobr", "s", "small", - "strike", "strong", "tt", "u"), endTagFormatting), - (("applet", "marquee", "object"), endTagAppletMarqueeObject), - ("br", endTagBr), - ]) - endTagHandler.default = endTagOther - - class TextPhase(Phase): - __slots__ = tuple() - - def processCharacters(self, token): - self.tree.insertText(token["data"]) - - def processEOF(self): - self.parser.parseError("expected-named-closing-tag-but-got-eof", - {"name": self.tree.openElements[-1].name}) - self.tree.openElements.pop() - self.parser.phase = self.parser.originalPhase - return True + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def startTagOther(self, token): - assert False, "Tried to process start tag %s in RCDATA/RAWTEXT mode" % token['name'] + def endTagOther(self, token): + return self.parser.phases["inBody"].processEndTag(token) - def endTagScript(self, token): - node = self.tree.openElements.pop() - assert node.name == "script" - self.parser.phase = self.parser.originalPhase - # The rest of this method is all stuff that only happens if - # document.write works + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", + "thead", "tr"), startTagTableElement) + ]) + startTagHandler.default = startTagOther - def endTagOther(self, token): - self.tree.openElements.pop() - self.parser.phase = self.parser.originalPhase - - startTagHandler = _utils.MethodDispatcher([]) - startTagHandler.default = startTagOther - endTagHandler = _utils.MethodDispatcher([ - ("script", endTagScript)]) - endTagHandler.default = endTagOther - - class InTablePhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-table - __slots__ = tuple() - - # helper methods - def clearStackToTableContext(self): - # "clear the stack back to a table context" - while self.tree.openElements[-1].name not in ("table", "html"): - # self.parser.parseError("unexpected-implied-end-tag-in-table", - # {"name": self.tree.openElements[-1].name}) - self.tree.openElements.pop() - # When the current node is <html> it's an innerHTML case + endTagHandler = _utils.MethodDispatcher([ + ("caption", endTagCaption), + ("table", endTagTable), + (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", + "thead", "tr"), endTagIgnore) + ]) + endTagHandler.default = endTagOther - # processing methods - def processEOF(self): - if self.tree.openElements[-1].name != "html": - self.parser.parseError("eof-in-table") - else: - assert self.parser.innerHTML - # Stop parsing - - def processSpaceCharacters(self, token): - originalPhase = self.parser.phase - self.parser.phase = self.parser.phases["inTableText"] - self.parser.phase.originalPhase = originalPhase - self.parser.phase.processSpaceCharacters(token) - - def processCharacters(self, token): - originalPhase = self.parser.phase - self.parser.phase = self.parser.phases["inTableText"] - self.parser.phase.originalPhase = originalPhase - self.parser.phase.processCharacters(token) - - def insertText(self, token): - # If we get here there must be at least one non-whitespace character - # Do the table magic! - self.tree.insertFromTable = True - self.parser.phases["inBody"].processCharacters(token) - self.tree.insertFromTable = False - - def startTagCaption(self, token): - self.clearStackToTableContext() - self.tree.activeFormattingElements.append(Marker) - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inCaption"] - def startTagColgroup(self, token): - self.clearStackToTableContext() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inColumnGroup"] +class InColumnGroupPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-column + __slots__ = tuple() - def startTagCol(self, token): - self.startTagColgroup(impliedTagToken("colgroup", "StartTag")) - return token + def ignoreEndTagColgroup(self): + return self.tree.openElements[-1].name == "html" - def startTagRowGroup(self, token): - self.clearStackToTableContext() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inTableBody"] + def processEOF(self): + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML + return + else: + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: + return True - def startTagImplyTbody(self, token): - self.startTagRowGroup(impliedTagToken("tbody", "StartTag")) + def processCharacters(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: return token - def startTagTable(self, token): - self.parser.parseError("unexpected-start-tag-implies-end-tag", - {"startName": "table", "endName": "table"}) - self.parser.phase.processEndTag(impliedTagToken("table")) - if not self.parser.innerHTML: - return token - - def startTagStyleScript(self, token): - return self.parser.phases["inHead"].processStartTag(token) - - def startTagInput(self, token): - if ("type" in token["data"] and - token["data"]["type"].translate(asciiUpper2Lower) == "hidden"): - self.parser.parseError("unexpected-hidden-input-in-table") - self.tree.insertElement(token) - # XXX associate with form - self.tree.openElements.pop() - else: - self.startTagOther(token) - - def startTagForm(self, token): - self.parser.parseError("unexpected-form-in-table") - if self.tree.formPointer is None: - self.tree.insertElement(token) - self.tree.formPointer = self.tree.openElements[-1] - self.tree.openElements.pop() - - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-implies-table-voodoo", {"name": token["name"]}) - # Do the table magic! - self.tree.insertFromTable = True - self.parser.phases["inBody"].processStartTag(token) - self.tree.insertFromTable = False - - def endTagTable(self, token): - if self.tree.elementInScope("table", variant="table"): - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != "table": - self.parser.parseError("end-tag-too-early-named", - {"gotName": "table", - "expectedName": self.tree.openElements[-1].name}) - while self.tree.openElements[-1].name != "table": - self.tree.openElements.pop() - self.tree.openElements.pop() - self.parser.resetInsertionMode() - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) + def startTagCol(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-implies-table-voodoo", {"name": token["name"]}) - # Do the table magic! - self.tree.insertFromTable = True - self.parser.phases["inBody"].processEndTag(token) - self.tree.insertFromTable = False - - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - ("caption", startTagCaption), - ("colgroup", startTagColgroup), - ("col", startTagCol), - (("tbody", "tfoot", "thead"), startTagRowGroup), - (("td", "th", "tr"), startTagImplyTbody), - ("table", startTagTable), - (("style", "script"), startTagStyleScript), - ("input", startTagInput), - ("form", startTagForm) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - ("table", endTagTable), - (("body", "caption", "col", "colgroup", "html", "tbody", "td", - "tfoot", "th", "thead", "tr"), endTagIgnore) - ]) - endTagHandler.default = endTagOther - - class InTableTextPhase(Phase): - __slots__ = ("originalPhase", "characterTokens") - - def __init__(self, *args, **kwargs): - super(InTableTextPhase, self).__init__(*args, **kwargs) - self.originalPhase = None - self.characterTokens = [] - - def flushCharacters(self): - data = "".join([item["data"] for item in self.characterTokens]) - if any(item not in spaceCharacters for item in data): - token = {"type": tokenTypes["Characters"], "data": data} - self.parser.phases["inTable"].insertText(token) - elif data: - self.tree.insertText(data) - self.characterTokens = [] - - def processComment(self, token): - self.flushCharacters() - self.parser.phase = self.originalPhase + def startTagOther(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: return token - def processEOF(self): - self.flushCharacters() - self.parser.phase = self.originalPhase - return True - - def processCharacters(self, token): - if token["data"] == "\u0000": - return - self.characterTokens.append(token) - - def processSpaceCharacters(self, token): - # pretty sure we should never reach here - self.characterTokens.append(token) - # assert False + def endTagColgroup(self, token): + if self.ignoreEndTagColgroup(): + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() + else: + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTable"] - def processStartTag(self, token): - self.flushCharacters() - self.parser.phase = self.originalPhase - return token + def endTagCol(self, token): + self.parser.parseError("no-end-tag", {"name": "col"}) - def processEndTag(self, token): - self.flushCharacters() - self.parser.phase = self.originalPhase + def endTagOther(self, token): + ignoreEndTag = self.ignoreEndTagColgroup() + self.endTagColgroup(impliedTagToken("colgroup")) + if not ignoreEndTag: return token - class InCaptionPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-caption - __slots__ = tuple() + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("col", startTagCol) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("colgroup", endTagColgroup), + ("col", endTagCol) + ]) + endTagHandler.default = endTagOther + + +class InTableBodyPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 + __slots__ = tuple() + + # helper methods + def clearStackToTableBodyContext(self): + while self.tree.openElements[-1].name not in ("tbody", "tfoot", + "thead", "html"): + # self.parser.parseError("unexpected-implied-end-tag-in-table", + # {"name": self.tree.openElements[-1].name}) + self.tree.openElements.pop() + if self.tree.openElements[-1].name == "html": + assert self.parser.innerHTML - def ignoreEndTagCaption(self): - return not self.tree.elementInScope("caption", variant="table") + # the rest + def processEOF(self): + self.parser.phases["inTable"].processEOF() - def processEOF(self): - self.parser.phases["inBody"].processEOF() + def processSpaceCharacters(self, token): + return self.parser.phases["inTable"].processSpaceCharacters(token) - def processCharacters(self, token): - return self.parser.phases["inBody"].processCharacters(token) + def processCharacters(self, token): + return self.parser.phases["inTable"].processCharacters(token) - def startTagTableElement(self, token): + def startTagTr(self, token): + self.clearStackToTableBodyContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inRow"] + + def startTagTableCell(self, token): + self.parser.parseError("unexpected-cell-in-table-body", + {"name": token["name"]}) + self.startTagTr(impliedTagToken("tr", "StartTag")) + return token + + def startTagTableOther(self, token): + # XXX AT Any ideas on how to share this with endTagTable? + if (self.tree.elementInScope("tbody", variant="table") or + self.tree.elementInScope("thead", variant="table") or + self.tree.elementInScope("tfoot", variant="table")): + self.clearStackToTableBodyContext() + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + return token + else: + # innerHTML case + assert self.parser.innerHTML self.parser.parseError() - # XXX Have to duplicate logic here to find out if the tag is ignored - ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag(impliedTagToken("caption")) - if not ignoreEndTag: - return token - def startTagOther(self, token): - return self.parser.phases["inBody"].processStartTag(token) + def startTagOther(self, token): + return self.parser.phases["inTable"].processStartTag(token) - def endTagCaption(self, token): - if not self.ignoreEndTagCaption(): - # AT this code is quite similar to endTagTable in "InTable" - self.tree.generateImpliedEndTags() - if self.tree.openElements[-1].name != "caption": - self.parser.parseError("expected-one-end-tag-but-got-another", - {"gotName": "caption", - "expectedName": self.tree.openElements[-1].name}) - while self.tree.openElements[-1].name != "caption": - self.tree.openElements.pop() - self.tree.openElements.pop() - self.tree.clearActiveFormattingElements() - self.parser.phase = self.parser.phases["inTable"] - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.clearStackToTableBodyContext() + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTable"] + else: + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) - def endTagTable(self, token): + def endTagTable(self, token): + if (self.tree.elementInScope("tbody", variant="table") or + self.tree.elementInScope("thead", variant="table") or + self.tree.elementInScope("tfoot", variant="table")): + self.clearStackToTableBodyContext() + self.endTagTableRowGroup( + impliedTagToken(self.tree.openElements[-1].name)) + return token + else: + # innerHTML case + assert self.parser.innerHTML self.parser.parseError() - ignoreEndTag = self.ignoreEndTagCaption() - self.parser.phase.processEndTag(impliedTagToken("caption")) - if not ignoreEndTag: - return token - - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - def endTagOther(self, token): - return self.parser.phases["inBody"].processEndTag(token) - - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", - "thead", "tr"), startTagTableElement) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - ("caption", endTagCaption), - ("table", endTagTable), - (("body", "col", "colgroup", "html", "tbody", "td", "tfoot", "th", - "thead", "tr"), endTagIgnore) - ]) - endTagHandler.default = endTagOther - - class InColumnGroupPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-column - __slots__ = tuple() - - def ignoreEndTagColgroup(self): - return self.tree.openElements[-1].name == "html" - - def processEOF(self): - if self.tree.openElements[-1].name == "html": - assert self.parser.innerHTML - return - else: - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup(impliedTagToken("colgroup")) - if not ignoreEndTag: - return True - def processCharacters(self, token): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup(impliedTagToken("colgroup")) - if not ignoreEndTag: - return token - - def startTagCol(self, token): - self.tree.insertElement(token) + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-body", + {"name": token["name"]}) + + def endTagOther(self, token): + return self.parser.phases["inTable"].processEndTag(token) + + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("tr", startTagTr), + (("td", "th"), startTagTableCell), + (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), + startTagTableOther) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + (("tbody", "tfoot", "thead"), endTagTableRowGroup), + ("table", endTagTable), + (("body", "caption", "col", "colgroup", "html", "td", "th", + "tr"), endTagIgnore) + ]) + endTagHandler.default = endTagOther + + +class InRowPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-row + __slots__ = tuple() + + # helper methods (XXX unify this with other table helper methods) + def clearStackToTableRowContext(self): + while self.tree.openElements[-1].name not in ("tr", "html"): + self.parser.parseError("unexpected-implied-end-tag-in-table-row", + {"name": self.tree.openElements[-1].name}) self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - def startTagOther(self, token): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup(impliedTagToken("colgroup")) - if not ignoreEndTag: - return token + def ignoreEndTagTr(self): + return not self.tree.elementInScope("tr", variant="table") - def endTagColgroup(self, token): - if self.ignoreEndTagColgroup(): - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - else: - self.tree.openElements.pop() - self.parser.phase = self.parser.phases["inTable"] + # the rest + def processEOF(self): + self.parser.phases["inTable"].processEOF() - def endTagCol(self, token): - self.parser.parseError("no-end-tag", {"name": "col"}) + def processSpaceCharacters(self, token): + return self.parser.phases["inTable"].processSpaceCharacters(token) - def endTagOther(self, token): - ignoreEndTag = self.ignoreEndTagColgroup() - self.endTagColgroup(impliedTagToken("colgroup")) - if not ignoreEndTag: - return token - - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - ("col", startTagCol) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - ("colgroup", endTagColgroup), - ("col", endTagCol) - ]) - endTagHandler.default = endTagOther - - class InTableBodyPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-table0 - __slots__ = tuple() - - # helper methods - def clearStackToTableBodyContext(self): - while self.tree.openElements[-1].name not in ("tbody", "tfoot", - "thead", "html"): - # self.parser.parseError("unexpected-implied-end-tag-in-table", - # {"name": self.tree.openElements[-1].name}) - self.tree.openElements.pop() - if self.tree.openElements[-1].name == "html": - assert self.parser.innerHTML + def processCharacters(self, token): + return self.parser.phases["inTable"].processCharacters(token) - # the rest - def processEOF(self): - self.parser.phases["inTable"].processEOF() + def startTagTableCell(self, token): + self.clearStackToTableRowContext() + self.tree.insertElement(token) + self.parser.phase = self.parser.phases["inCell"] + self.tree.activeFormattingElements.append(Marker) + + def startTagTableOther(self, token): + ignoreEndTag = self.ignoreEndTagTr() + self.endTagTr(impliedTagToken("tr")) + # XXX how are we sure it's always ignored in the innerHTML case? + if not ignoreEndTag: + return token - def processSpaceCharacters(self, token): - return self.parser.phases["inTable"].processSpaceCharacters(token) + def startTagOther(self, token): + return self.parser.phases["inTable"].processStartTag(token) - def processCharacters(self, token): - return self.parser.phases["inTable"].processCharacters(token) + def endTagTr(self, token): + if not self.ignoreEndTagTr(): + self.clearStackToTableRowContext() + self.tree.openElements.pop() + self.parser.phase = self.parser.phases["inTableBody"] + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() - def startTagTr(self, token): - self.clearStackToTableBodyContext() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inRow"] + def endTagTable(self, token): + ignoreEndTag = self.ignoreEndTagTr() + self.endTagTr(impliedTagToken("tr")) + # Reprocess the current tag if the tr end tag was not ignored + # XXX how are we sure it's always ignored in the innerHTML case? + if not ignoreEndTag: + return token - def startTagTableCell(self, token): - self.parser.parseError("unexpected-cell-in-table-body", - {"name": token["name"]}) - self.startTagTr(impliedTagToken("tr", "StartTag")) + def endTagTableRowGroup(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.endTagTr(impliedTagToken("tr")) return token + else: + self.parser.parseError() - def startTagTableOther(self, token): - # XXX AT Any ideas on how to share this with endTagTable? - if (self.tree.elementInScope("tbody", variant="table") or - self.tree.elementInScope("thead", variant="table") or - self.tree.elementInScope("tfoot", variant="table")): - self.clearStackToTableBodyContext() - self.endTagTableRowGroup( - impliedTagToken(self.tree.openElements[-1].name)) - return token - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag-in-table-row", + {"name": token["name"]}) + + def endTagOther(self, token): + return self.parser.phases["inTable"].processEndTag(token) + + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + (("td", "th"), startTagTableCell), + (("caption", "col", "colgroup", "tbody", "tfoot", "thead", + "tr"), startTagTableOther) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("tr", endTagTr), + ("table", endTagTable), + (("tbody", "tfoot", "thead"), endTagTableRowGroup), + (("body", "caption", "col", "colgroup", "html", "td", "th"), + endTagIgnore) + ]) + endTagHandler.default = endTagOther + + +class InCellPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-cell + __slots__ = tuple() + + # helper + def closeCell(self): + if self.tree.elementInScope("td", variant="table"): + self.endTagTableCell(impliedTagToken("td")) + elif self.tree.elementInScope("th", variant="table"): + self.endTagTableCell(impliedTagToken("th")) + + # the rest + def processEOF(self): + self.parser.phases["inBody"].processEOF() + + def processCharacters(self, token): + return self.parser.phases["inBody"].processCharacters(token) + + def startTagTableOther(self, token): + if (self.tree.elementInScope("td", variant="table") or + self.tree.elementInScope("th", variant="table")): + self.closeCell() + return token + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() - def startTagOther(self, token): - return self.parser.phases["inTable"].processStartTag(token) + def startTagOther(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def endTagTableRowGroup(self, token): - if self.tree.elementInScope(token["name"], variant="table"): - self.clearStackToTableBodyContext() - self.tree.openElements.pop() - self.parser.phase = self.parser.phases["inTable"] - else: - self.parser.parseError("unexpected-end-tag-in-table-body", + def endTagTableCell(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.tree.generateImpliedEndTags(token["name"]) + if self.tree.openElements[-1].name != token["name"]: + self.parser.parseError("unexpected-cell-end-tag", {"name": token["name"]}) - - def endTagTable(self, token): - if (self.tree.elementInScope("tbody", variant="table") or - self.tree.elementInScope("thead", variant="table") or - self.tree.elementInScope("tfoot", variant="table")): - self.clearStackToTableBodyContext() - self.endTagTableRowGroup( - impliedTagToken(self.tree.openElements[-1].name)) - return token + while True: + node = self.tree.openElements.pop() + if node.name == token["name"]: + break else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag-in-table-body", - {"name": token["name"]}) - - def endTagOther(self, token): - return self.parser.phases["inTable"].processEndTag(token) - - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - ("tr", startTagTr), - (("td", "th"), startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead"), - startTagTableOther) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - (("tbody", "tfoot", "thead"), endTagTableRowGroup), - ("table", endTagTable), - (("body", "caption", "col", "colgroup", "html", "td", "th", - "tr"), endTagIgnore) - ]) - endTagHandler.default = endTagOther - - class InRowPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-row - __slots__ = tuple() - - # helper methods (XXX unify this with other table helper methods) - def clearStackToTableRowContext(self): - while self.tree.openElements[-1].name not in ("tr", "html"): - self.parser.parseError("unexpected-implied-end-tag-in-table-row", - {"name": self.tree.openElements[-1].name}) - self.tree.openElements.pop() - - def ignoreEndTagTr(self): - return not self.tree.elementInScope("tr", variant="table") - - # the rest - def processEOF(self): - self.parser.phases["inTable"].processEOF() - - def processSpaceCharacters(self, token): - return self.parser.phases["inTable"].processSpaceCharacters(token) - - def processCharacters(self, token): - return self.parser.phases["inTable"].processCharacters(token) - - def startTagTableCell(self, token): - self.clearStackToTableRowContext() - self.tree.insertElement(token) - self.parser.phase = self.parser.phases["inCell"] - self.tree.activeFormattingElements.append(Marker) - - def startTagTableOther(self, token): - ignoreEndTag = self.ignoreEndTagTr() - self.endTagTr(impliedTagToken("tr")) - # XXX how are we sure it's always ignored in the innerHTML case? - if not ignoreEndTag: - return token - - def startTagOther(self, token): - return self.parser.phases["inTable"].processStartTag(token) - - def endTagTr(self, token): - if not self.ignoreEndTagTr(): - self.clearStackToTableRowContext() self.tree.openElements.pop() - self.parser.phase = self.parser.phases["inTableBody"] - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() + self.tree.clearActiveFormattingElements() + self.parser.phase = self.parser.phases["inRow"] + else: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagTable(self, token): - ignoreEndTag = self.ignoreEndTagTr() - self.endTagTr(impliedTagToken("tr")) - # Reprocess the current tag if the tr end tag was not ignored - # XXX how are we sure it's always ignored in the innerHTML case? - if not ignoreEndTag: - return token + def endTagIgnore(self, token): + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagTableRowGroup(self, token): - if self.tree.elementInScope(token["name"], variant="table"): - self.endTagTr(impliedTagToken("tr")) - return token - else: - self.parser.parseError() + def endTagImply(self, token): + if self.tree.elementInScope(token["name"], variant="table"): + self.closeCell() + return token + else: + # sometimes innerHTML case + self.parser.parseError() - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag-in-table-row", - {"name": token["name"]}) + def endTagOther(self, token): + return self.parser.phases["inBody"].processEndTag(token) - def endTagOther(self, token): - return self.parser.phases["inTable"].processEndTag(token) - - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - (("td", "th"), startTagTableCell), - (("caption", "col", "colgroup", "tbody", "tfoot", "thead", - "tr"), startTagTableOther) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - ("tr", endTagTr), - ("table", endTagTable), - (("tbody", "tfoot", "thead"), endTagTableRowGroup), - (("body", "caption", "col", "colgroup", "html", "td", "th"), - endTagIgnore) - ]) - endTagHandler.default = endTagOther - - class InCellPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-cell - __slots__ = tuple() - - # helper - def closeCell(self): - if self.tree.elementInScope("td", variant="table"): - self.endTagTableCell(impliedTagToken("td")) - elif self.tree.elementInScope("th", variant="table"): - self.endTagTableCell(impliedTagToken("th")) - - # the rest - def processEOF(self): - self.parser.phases["inBody"].processEOF() - - def processCharacters(self, token): - return self.parser.phases["inBody"].processCharacters(token) - - def startTagTableOther(self, token): - if (self.tree.elementInScope("td", variant="table") or - self.tree.elementInScope("th", variant="table")): - self.closeCell() - return token - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", + "thead", "tr"), startTagTableOther) + ]) + startTagHandler.default = startTagOther - def startTagOther(self, token): - return self.parser.phases["inBody"].processStartTag(token) + endTagHandler = _utils.MethodDispatcher([ + (("td", "th"), endTagTableCell), + (("body", "caption", "col", "colgroup", "html"), endTagIgnore), + (("table", "tbody", "tfoot", "thead", "tr"), endTagImply) + ]) + endTagHandler.default = endTagOther - def endTagTableCell(self, token): - if self.tree.elementInScope(token["name"], variant="table"): - self.tree.generateImpliedEndTags(token["name"]) - if self.tree.openElements[-1].name != token["name"]: - self.parser.parseError("unexpected-cell-end-tag", - {"name": token["name"]}) - while True: - node = self.tree.openElements.pop() - if node.name == token["name"]: - break - else: - self.tree.openElements.pop() - self.tree.clearActiveFormattingElements() - self.parser.phase = self.parser.phases["inRow"] - else: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - def endTagIgnore(self, token): - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) +class InSelectPhase(Phase): + __slots__ = tuple() - def endTagImply(self, token): - if self.tree.elementInScope(token["name"], variant="table"): - self.closeCell() - return token - else: - # sometimes innerHTML case - self.parser.parseError() + # http://www.whatwg.org/specs/web-apps/current-work/#in-select + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-select") + else: + assert self.parser.innerHTML - def endTagOther(self, token): - return self.parser.phases["inBody"].processEndTag(token) - - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - (("caption", "col", "colgroup", "tbody", "td", "tfoot", "th", - "thead", "tr"), startTagTableOther) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - (("td", "th"), endTagTableCell), - (("body", "caption", "col", "colgroup", "html"), endTagIgnore), - (("table", "tbody", "tfoot", "thead", "tr"), endTagImply) - ]) - endTagHandler.default = endTagOther - - class InSelectPhase(Phase): - __slots__ = tuple() - - # http://www.whatwg.org/specs/web-apps/current-work/#in-select - def processEOF(self): - if self.tree.openElements[-1].name != "html": - self.parser.parseError("eof-in-select") - else: - assert self.parser.innerHTML + def processCharacters(self, token): + if token["data"] == "\u0000": + return + self.tree.insertText(token["data"]) - def processCharacters(self, token): - if token["data"] == "\u0000": - return - self.tree.insertText(token["data"]) + def startTagOption(self, token): + # We need to imply </option> if <option> is the current node. + if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + self.tree.insertElement(token) - def startTagOption(self, token): - # We need to imply </option> if <option> is the current node. - if self.tree.openElements[-1].name == "option": - self.tree.openElements.pop() - self.tree.insertElement(token) + def startTagOptgroup(self, token): + if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + if self.tree.openElements[-1].name == "optgroup": + self.tree.openElements.pop() + self.tree.insertElement(token) - def startTagOptgroup(self, token): - if self.tree.openElements[-1].name == "option": - self.tree.openElements.pop() - if self.tree.openElements[-1].name == "optgroup": - self.tree.openElements.pop() - self.tree.insertElement(token) + def startTagSelect(self, token): + self.parser.parseError("unexpected-select-in-select") + self.endTagSelect(impliedTagToken("select")) - def startTagSelect(self, token): - self.parser.parseError("unexpected-select-in-select") + def startTagInput(self, token): + self.parser.parseError("unexpected-input-in-select") + if self.tree.elementInScope("select", variant="select"): self.endTagSelect(impliedTagToken("select")) + return token + else: + assert self.parser.innerHTML - def startTagInput(self, token): - self.parser.parseError("unexpected-input-in-select") - if self.tree.elementInScope("select", variant="select"): - self.endTagSelect(impliedTagToken("select")) - return token - else: - assert self.parser.innerHTML - - def startTagScript(self, token): - return self.parser.phases["inHead"].processStartTag(token) + def startTagScript(self, token): + return self.parser.phases["inHead"].processStartTag(token) - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-in-select", - {"name": token["name"]}) + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-in-select", + {"name": token["name"]}) - def endTagOption(self, token): - if self.tree.openElements[-1].name == "option": - self.tree.openElements.pop() - else: - self.parser.parseError("unexpected-end-tag-in-select", - {"name": "option"}) + def endTagOption(self, token): + if self.tree.openElements[-1].name == "option": + self.tree.openElements.pop() + else: + self.parser.parseError("unexpected-end-tag-in-select", + {"name": "option"}) - def endTagOptgroup(self, token): - # </optgroup> implicitly closes <option> - if (self.tree.openElements[-1].name == "option" and - self.tree.openElements[-2].name == "optgroup"): - self.tree.openElements.pop() - # It also closes </optgroup> - if self.tree.openElements[-1].name == "optgroup": - self.tree.openElements.pop() - # But nothing else - else: - self.parser.parseError("unexpected-end-tag-in-select", - {"name": "optgroup"}) + def endTagOptgroup(self, token): + # </optgroup> implicitly closes <option> + if (self.tree.openElements[-1].name == "option" and + self.tree.openElements[-2].name == "optgroup"): + self.tree.openElements.pop() + # It also closes </optgroup> + if self.tree.openElements[-1].name == "optgroup": + self.tree.openElements.pop() + # But nothing else + else: + self.parser.parseError("unexpected-end-tag-in-select", + {"name": "optgroup"}) - def endTagSelect(self, token): - if self.tree.elementInScope("select", variant="select"): + def endTagSelect(self, token): + if self.tree.elementInScope("select", variant="select"): + node = self.tree.openElements.pop() + while node.name != "select": node = self.tree.openElements.pop() - while node.name != "select": - node = self.tree.openElements.pop() - self.parser.resetInsertionMode() - else: - # innerHTML case - assert self.parser.innerHTML - self.parser.parseError() - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-in-select", - {"name": token["name"]}) + self.parser.resetInsertionMode() + else: + # innerHTML case + assert self.parser.innerHTML + self.parser.parseError() - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - ("option", startTagOption), - ("optgroup", startTagOptgroup), - ("select", startTagSelect), - (("input", "keygen", "textarea"), startTagInput), - ("script", startTagScript) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - ("option", endTagOption), - ("optgroup", endTagOptgroup), - ("select", endTagSelect) - ]) - endTagHandler.default = endTagOther - - class InSelectInTablePhase(Phase): - __slots__ = tuple() - - def processEOF(self): - self.parser.phases["inSelect"].processEOF() - - def processCharacters(self, token): - return self.parser.phases["inSelect"].processCharacters(token) - - def startTagTable(self, token): - self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) - self.endTagOther(impliedTagToken("select")) - return token + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-in-select", + {"name": token["name"]}) - def startTagOther(self, token): - return self.parser.phases["inSelect"].processStartTag(token) - - def endTagTable(self, token): - self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) - if self.tree.elementInScope(token["name"], variant="table"): - self.endTagOther(impliedTagToken("select")) - return token - - def endTagOther(self, token): - return self.parser.phases["inSelect"].processEndTag(token) - - startTagHandler = _utils.MethodDispatcher([ - (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), - startTagTable) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), - endTagTable) - ]) - endTagHandler.default = endTagOther - - class InForeignContentPhase(Phase): - __slots__ = tuple() - - breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", - "center", "code", "dd", "div", "dl", "dt", - "em", "embed", "h1", "h2", "h3", - "h4", "h5", "h6", "head", "hr", "i", "img", - "li", "listing", "menu", "meta", "nobr", - "ol", "p", "pre", "ruby", "s", "small", - "span", "strong", "strike", "sub", "sup", - "table", "tt", "u", "ul", "var"]) - - def adjustSVGTagNames(self, token): - replacements = {"altglyph": "altGlyph", - "altglyphdef": "altGlyphDef", - "altglyphitem": "altGlyphItem", - "animatecolor": "animateColor", - "animatemotion": "animateMotion", - "animatetransform": "animateTransform", - "clippath": "clipPath", - "feblend": "feBlend", - "fecolormatrix": "feColorMatrix", - "fecomponenttransfer": "feComponentTransfer", - "fecomposite": "feComposite", - "feconvolvematrix": "feConvolveMatrix", - "fediffuselighting": "feDiffuseLighting", - "fedisplacementmap": "feDisplacementMap", - "fedistantlight": "feDistantLight", - "feflood": "feFlood", - "fefunca": "feFuncA", - "fefuncb": "feFuncB", - "fefuncg": "feFuncG", - "fefuncr": "feFuncR", - "fegaussianblur": "feGaussianBlur", - "feimage": "feImage", - "femerge": "feMerge", - "femergenode": "feMergeNode", - "femorphology": "feMorphology", - "feoffset": "feOffset", - "fepointlight": "fePointLight", - "fespecularlighting": "feSpecularLighting", - "fespotlight": "feSpotLight", - "fetile": "feTile", - "feturbulence": "feTurbulence", - "foreignobject": "foreignObject", - "glyphref": "glyphRef", - "lineargradient": "linearGradient", - "radialgradient": "radialGradient", - "textpath": "textPath"} - - if token["name"] in replacements: - token["name"] = replacements[token["name"]] - - def processCharacters(self, token): - if token["data"] == "\u0000": - token["data"] = "\uFFFD" - elif (self.parser.framesetOK and - any(char not in spaceCharacters for char in token["data"])): - self.parser.framesetOK = False - Phase.processCharacters(self, token) - - def processStartTag(self, token): - currentNode = self.tree.openElements[-1] - if (token["name"] in self.breakoutElements or - (token["name"] == "font" and - set(token["data"].keys()) & {"color", "face", "size"})): - self.parser.parseError("unexpected-html-element-in-foreign-content", - {"name": token["name"]}) - while (self.tree.openElements[-1].namespace != - self.tree.defaultNamespace and - not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and - not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])): - self.tree.openElements.pop() - return token + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("option", startTagOption), + ("optgroup", startTagOptgroup), + ("select", startTagSelect), + (("input", "keygen", "textarea"), startTagInput), + ("script", startTagScript) + ]) + startTagHandler.default = startTagOther - else: - if currentNode.namespace == namespaces["mathml"]: - self.parser.adjustMathMLAttributes(token) - elif currentNode.namespace == namespaces["svg"]: - self.adjustSVGTagNames(token) - self.parser.adjustSVGAttributes(token) - self.parser.adjustForeignAttributes(token) - token["namespace"] = currentNode.namespace - self.tree.insertElement(token) - if token["selfClosing"]: - self.tree.openElements.pop() - token["selfClosingAcknowledged"] = True - - def processEndTag(self, token): - nodeIndex = len(self.tree.openElements) - 1 - node = self.tree.openElements[-1] - if node.name.translate(asciiUpper2Lower) != token["name"]: - self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - - while True: - if node.name.translate(asciiUpper2Lower) == token["name"]: - # XXX this isn't in the spec but it seems necessary - if self.parser.phase == self.parser.phases["inTableText"]: - self.parser.phase.flushCharacters() - self.parser.phase = self.parser.phase.originalPhase - while self.tree.openElements.pop() != node: - assert self.tree.openElements - new_token = None - break - nodeIndex -= 1 + endTagHandler = _utils.MethodDispatcher([ + ("option", endTagOption), + ("optgroup", endTagOptgroup), + ("select", endTagSelect) + ]) + endTagHandler.default = endTagOther - node = self.tree.openElements[nodeIndex] - if node.namespace != self.tree.defaultNamespace: - continue - else: - new_token = self.parser.phase.processEndTag(token) - break - return new_token - class AfterBodyPhase(Phase): - __slots__ = tuple() +class InSelectInTablePhase(Phase): + __slots__ = tuple() - def processEOF(self): - # Stop parsing - pass + def processEOF(self): + self.parser.phases["inSelect"].processEOF() - def processComment(self, token): - # This is needed because data is to be appended to the <html> element - # here and not to whatever is currently open. - self.tree.insertComment(token, self.tree.openElements[0]) + def processCharacters(self, token): + return self.parser.phases["inSelect"].processCharacters(token) - def processCharacters(self, token): - self.parser.parseError("unexpected-char-after-body") - self.parser.phase = self.parser.phases["inBody"] - return token + def startTagTable(self, token): + self.parser.parseError("unexpected-table-element-start-tag-in-select-in-table", {"name": token["name"]}) + self.endTagOther(impliedTagToken("select")) + return token - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + def startTagOther(self, token): + return self.parser.phases["inSelect"].processStartTag(token) - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-after-body", - {"name": token["name"]}) - self.parser.phase = self.parser.phases["inBody"] + def endTagTable(self, token): + self.parser.parseError("unexpected-table-element-end-tag-in-select-in-table", {"name": token["name"]}) + if self.tree.elementInScope(token["name"], variant="table"): + self.endTagOther(impliedTagToken("select")) return token - def endTagHtml(self, name): - if self.parser.innerHTML: - self.parser.parseError("unexpected-end-tag-after-body-innerhtml") - else: - self.parser.phase = self.parser.phases["afterAfterBody"] - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-after-body", + def endTagOther(self, token): + return self.parser.phases["inSelect"].processEndTag(token) + + startTagHandler = _utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + startTagTable) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + (("caption", "table", "tbody", "tfoot", "thead", "tr", "td", "th"), + endTagTable) + ]) + endTagHandler.default = endTagOther + + +class InForeignContentPhase(Phase): + __slots__ = tuple() + + breakoutElements = frozenset(["b", "big", "blockquote", "body", "br", + "center", "code", "dd", "div", "dl", "dt", + "em", "embed", "h1", "h2", "h3", + "h4", "h5", "h6", "head", "hr", "i", "img", + "li", "listing", "menu", "meta", "nobr", + "ol", "p", "pre", "ruby", "s", "small", + "span", "strong", "strike", "sub", "sup", + "table", "tt", "u", "ul", "var"]) + + def adjustSVGTagNames(self, token): + replacements = {"altglyph": "altGlyph", + "altglyphdef": "altGlyphDef", + "altglyphitem": "altGlyphItem", + "animatecolor": "animateColor", + "animatemotion": "animateMotion", + "animatetransform": "animateTransform", + "clippath": "clipPath", + "feblend": "feBlend", + "fecolormatrix": "feColorMatrix", + "fecomponenttransfer": "feComponentTransfer", + "fecomposite": "feComposite", + "feconvolvematrix": "feConvolveMatrix", + "fediffuselighting": "feDiffuseLighting", + "fedisplacementmap": "feDisplacementMap", + "fedistantlight": "feDistantLight", + "feflood": "feFlood", + "fefunca": "feFuncA", + "fefuncb": "feFuncB", + "fefuncg": "feFuncG", + "fefuncr": "feFuncR", + "fegaussianblur": "feGaussianBlur", + "feimage": "feImage", + "femerge": "feMerge", + "femergenode": "feMergeNode", + "femorphology": "feMorphology", + "feoffset": "feOffset", + "fepointlight": "fePointLight", + "fespecularlighting": "feSpecularLighting", + "fespotlight": "feSpotLight", + "fetile": "feTile", + "feturbulence": "feTurbulence", + "foreignobject": "foreignObject", + "glyphref": "glyphRef", + "lineargradient": "linearGradient", + "radialgradient": "radialGradient", + "textpath": "textPath"} + + if token["name"] in replacements: + token["name"] = replacements[token["name"]] + + def processCharacters(self, token): + if token["data"] == "\u0000": + token["data"] = "\uFFFD" + elif (self.parser.framesetOK and + any(char not in spaceCharacters for char in token["data"])): + self.parser.framesetOK = False + Phase.processCharacters(self, token) + + def processStartTag(self, token): + currentNode = self.tree.openElements[-1] + if (token["name"] in self.breakoutElements or + (token["name"] == "font" and + set(token["data"].keys()) & {"color", "face", "size"})): + self.parser.parseError("unexpected-html-element-in-foreign-content", {"name": token["name"]}) - self.parser.phase = self.parser.phases["inBody"] + while (self.tree.openElements[-1].namespace != + self.tree.defaultNamespace and + not self.parser.isHTMLIntegrationPoint(self.tree.openElements[-1]) and + not self.parser.isMathMLTextIntegrationPoint(self.tree.openElements[-1])): + self.tree.openElements.pop() return token - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml) - ]) - startTagHandler.default = startTagOther + else: + if currentNode.namespace == namespaces["mathml"]: + self.parser.adjustMathMLAttributes(token) + elif currentNode.namespace == namespaces["svg"]: + self.adjustSVGTagNames(token) + self.parser.adjustSVGAttributes(token) + self.parser.adjustForeignAttributes(token) + token["namespace"] = currentNode.namespace + self.tree.insertElement(token) + if token["selfClosing"]: + self.tree.openElements.pop() + token["selfClosingAcknowledged"] = True - endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)]) - endTagHandler.default = endTagOther + def processEndTag(self, token): + nodeIndex = len(self.tree.openElements) - 1 + node = self.tree.openElements[-1] + if node.name.translate(asciiUpper2Lower) != token["name"]: + self.parser.parseError("unexpected-end-tag", {"name": token["name"]}) - class InFramesetPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset - __slots__ = tuple() + while True: + if node.name.translate(asciiUpper2Lower) == token["name"]: + # XXX this isn't in the spec but it seems necessary + if self.parser.phase == self.parser.phases["inTableText"]: + self.parser.phase.flushCharacters() + self.parser.phase = self.parser.phase.originalPhase + while self.tree.openElements.pop() != node: + assert self.tree.openElements + new_token = None + break + nodeIndex -= 1 - def processEOF(self): - if self.tree.openElements[-1].name != "html": - self.parser.parseError("eof-in-frameset") + node = self.tree.openElements[nodeIndex] + if node.namespace != self.tree.defaultNamespace: + continue else: - assert self.parser.innerHTML + new_token = self.parser.phase.processEndTag(token) + break + return new_token - def processCharacters(self, token): - self.parser.parseError("unexpected-char-in-frameset") - def startTagFrameset(self, token): - self.tree.insertElement(token) +class AfterBodyPhase(Phase): + __slots__ = tuple() - def startTagFrame(self, token): - self.tree.insertElement(token) - self.tree.openElements.pop() + def processEOF(self): + # Stop parsing + pass - def startTagNoframes(self, token): - return self.parser.phases["inBody"].processStartTag(token) + def processComment(self, token): + # This is needed because data is to be appended to the <html> element + # here and not to whatever is currently open. + self.tree.insertComment(token, self.tree.openElements[0]) - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-in-frameset", - {"name": token["name"]}) + def processCharacters(self, token): + self.parser.parseError("unexpected-char-after-body") + self.parser.phase = self.parser.phases["inBody"] + return token - def endTagFrameset(self, token): - if self.tree.openElements[-1].name == "html": - # innerHTML case - self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") - else: - self.tree.openElements.pop() - if (not self.parser.innerHTML and - self.tree.openElements[-1].name != "frameset"): - # If we're not in innerHTML mode and the current node is not a - # "frameset" element (anymore) then switch. - self.parser.phase = self.parser.phases["afterFrameset"] - - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-in-frameset", - {"name": token["name"]}) - - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - ("frameset", startTagFrameset), - ("frame", startTagFrame), - ("noframes", startTagNoframes) - ]) - startTagHandler.default = startTagOther - - endTagHandler = _utils.MethodDispatcher([ - ("frameset", endTagFrameset) - ]) - endTagHandler.default = endTagOther - - class AfterFramesetPhase(Phase): - # http://www.whatwg.org/specs/web-apps/current-work/#after3 - __slots__ = tuple() - - def processEOF(self): - # Stop parsing - pass + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def processCharacters(self, token): - self.parser.parseError("unexpected-char-after-frameset") + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-after-body", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + return token - def startTagNoframes(self, token): - return self.parser.phases["inHead"].processStartTag(token) + def endTagHtml(self, name): + if self.parser.innerHTML: + self.parser.parseError("unexpected-end-tag-after-body-innerhtml") + else: + self.parser.phase = self.parser.phases["afterAfterBody"] - def startTagOther(self, token): - self.parser.parseError("unexpected-start-tag-after-frameset", - {"name": token["name"]}) + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-after-body", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + return token - def endTagHtml(self, token): - self.parser.phase = self.parser.phases["afterAfterFrameset"] + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml) + ]) + startTagHandler.default = startTagOther - def endTagOther(self, token): - self.parser.parseError("unexpected-end-tag-after-frameset", - {"name": token["name"]}) + endTagHandler = _utils.MethodDispatcher([("html", endTagHtml)]) + endTagHandler.default = endTagOther - startTagHandler = _utils.MethodDispatcher([ - ("html", Phase.startTagHtml), - ("noframes", startTagNoframes) - ]) - startTagHandler.default = startTagOther - endTagHandler = _utils.MethodDispatcher([ - ("html", endTagHtml) - ]) - endTagHandler.default = endTagOther +class InFramesetPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#in-frameset + __slots__ = tuple() - class AfterAfterBodyPhase(Phase): - __slots__ = tuple() + def processEOF(self): + if self.tree.openElements[-1].name != "html": + self.parser.parseError("eof-in-frameset") + else: + assert self.parser.innerHTML - def processEOF(self): - pass + def processCharacters(self, token): + self.parser.parseError("unexpected-char-in-frameset") - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) + def startTagFrameset(self, token): + self.tree.insertElement(token) - def processSpaceCharacters(self, token): - return self.parser.phases["inBody"].processSpaceCharacters(token) + def startTagFrame(self, token): + self.tree.insertElement(token) + self.tree.openElements.pop() - def processCharacters(self, token): - self.parser.parseError("expected-eof-but-got-char") - self.parser.phase = self.parser.phases["inBody"] - return token + def startTagNoframes(self, token): + return self.parser.phases["inBody"].processStartTag(token) - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-in-frameset", + {"name": token["name"]}) - def startTagOther(self, token): - self.parser.parseError("expected-eof-but-got-start-tag", - {"name": token["name"]}) - self.parser.phase = self.parser.phases["inBody"] - return token + def endTagFrameset(self, token): + if self.tree.openElements[-1].name == "html": + # innerHTML case + self.parser.parseError("unexpected-frameset-in-frameset-innerhtml") + else: + self.tree.openElements.pop() + if (not self.parser.innerHTML and + self.tree.openElements[-1].name != "frameset"): + # If we're not in innerHTML mode and the current node is not a + # "frameset" element (anymore) then switch. + self.parser.phase = self.parser.phases["afterFrameset"] - def processEndTag(self, token): - self.parser.parseError("expected-eof-but-got-end-tag", - {"name": token["name"]}) - self.parser.phase = self.parser.phases["inBody"] - return token + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-in-frameset", + {"name": token["name"]}) - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml) - ]) - startTagHandler.default = startTagOther + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("frameset", startTagFrameset), + ("frame", startTagFrame), + ("noframes", startTagNoframes) + ]) + startTagHandler.default = startTagOther - class AfterAfterFramesetPhase(Phase): - __slots__ = tuple() + endTagHandler = _utils.MethodDispatcher([ + ("frameset", endTagFrameset) + ]) + endTagHandler.default = endTagOther - def processEOF(self): - pass - def processComment(self, token): - self.tree.insertComment(token, self.tree.document) +class AfterFramesetPhase(Phase): + # http://www.whatwg.org/specs/web-apps/current-work/#after3 + __slots__ = tuple() - def processSpaceCharacters(self, token): - return self.parser.phases["inBody"].processSpaceCharacters(token) + def processEOF(self): + # Stop parsing + pass - def processCharacters(self, token): - self.parser.parseError("expected-eof-but-got-char") + def processCharacters(self, token): + self.parser.parseError("unexpected-char-after-frameset") - def startTagHtml(self, token): - return self.parser.phases["inBody"].processStartTag(token) + def startTagNoframes(self, token): + return self.parser.phases["inHead"].processStartTag(token) - def startTagNoFrames(self, token): - return self.parser.phases["inHead"].processStartTag(token) + def startTagOther(self, token): + self.parser.parseError("unexpected-start-tag-after-frameset", + {"name": token["name"]}) - def startTagOther(self, token): - self.parser.parseError("expected-eof-but-got-start-tag", - {"name": token["name"]}) + def endTagHtml(self, token): + self.parser.phase = self.parser.phases["afterAfterFrameset"] - def processEndTag(self, token): - self.parser.parseError("expected-eof-but-got-end-tag", - {"name": token["name"]}) + def endTagOther(self, token): + self.parser.parseError("unexpected-end-tag-after-frameset", + {"name": token["name"]}) - startTagHandler = _utils.MethodDispatcher([ - ("html", startTagHtml), - ("noframes", startTagNoFrames) - ]) - startTagHandler.default = startTagOther - - # pylint:enable=unused-argument - - return { - "initial": InitialPhase, - "beforeHtml": BeforeHtmlPhase, - "beforeHead": BeforeHeadPhase, - "inHead": InHeadPhase, - "inHeadNoscript": InHeadNoscriptPhase, - "afterHead": AfterHeadPhase, - "inBody": InBodyPhase, - "text": TextPhase, - "inTable": InTablePhase, - "inTableText": InTableTextPhase, - "inCaption": InCaptionPhase, - "inColumnGroup": InColumnGroupPhase, - "inTableBody": InTableBodyPhase, - "inRow": InRowPhase, - "inCell": InCellPhase, - "inSelect": InSelectPhase, - "inSelectInTable": InSelectInTablePhase, - "inForeignContent": InForeignContentPhase, - "afterBody": AfterBodyPhase, - "inFrameset": InFramesetPhase, - "afterFrameset": AfterFramesetPhase, - "afterAfterBody": AfterAfterBodyPhase, - "afterAfterFrameset": AfterAfterFramesetPhase, - # XXX after after frameset - } + startTagHandler = _utils.MethodDispatcher([ + ("html", Phase.startTagHtml), + ("noframes", startTagNoframes) + ]) + startTagHandler.default = startTagOther + + endTagHandler = _utils.MethodDispatcher([ + ("html", endTagHtml) + ]) + endTagHandler.default = endTagOther + + +class AfterAfterBodyPhase(Phase): + __slots__ = tuple() + + def processEOF(self): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + return self.parser.phases["inBody"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.parseError("expected-eof-but-got-char") + self.parser.phase = self.parser.phases["inBody"] + return token + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("expected-eof-but-got-start-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + return token + + def processEndTag(self, token): + self.parser.parseError("expected-eof-but-got-end-tag", + {"name": token["name"]}) + self.parser.phase = self.parser.phases["inBody"] + return token + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml) + ]) + startTagHandler.default = startTagOther + + +class AfterAfterFramesetPhase(Phase): + __slots__ = tuple() + + def processEOF(self): + pass + + def processComment(self, token): + self.tree.insertComment(token, self.tree.document) + + def processSpaceCharacters(self, token): + return self.parser.phases["inBody"].processSpaceCharacters(token) + + def processCharacters(self, token): + self.parser.parseError("expected-eof-but-got-char") + + def startTagHtml(self, token): + return self.parser.phases["inBody"].processStartTag(token) + + def startTagNoFrames(self, token): + return self.parser.phases["inHead"].processStartTag(token) + + def startTagOther(self, token): + self.parser.parseError("expected-eof-but-got-start-tag", + {"name": token["name"]}) + + def processEndTag(self, token): + self.parser.parseError("expected-eof-but-got-end-tag", + {"name": token["name"]}) + + startTagHandler = _utils.MethodDispatcher([ + ("html", startTagHtml), + ("noframes", startTagNoFrames) + ]) + startTagHandler.default = startTagOther + +# pylint:enable=unused-argument + + +_phases = { + "initial": InitialPhase, + "beforeHtml": BeforeHtmlPhase, + "beforeHead": BeforeHeadPhase, + "inHead": InHeadPhase, + "inHeadNoscript": InHeadNoscriptPhase, + "afterHead": AfterHeadPhase, + "inBody": InBodyPhase, + "text": TextPhase, + "inTable": InTablePhase, + "inTableText": InTableTextPhase, + "inCaption": InCaptionPhase, + "inColumnGroup": InColumnGroupPhase, + "inTableBody": InTableBodyPhase, + "inRow": InRowPhase, + "inCell": InCellPhase, + "inSelect": InSelectPhase, + "inSelectInTable": InSelectInTablePhase, + "inForeignContent": InForeignContentPhase, + "afterBody": AfterBodyPhase, + "inFrameset": InFramesetPhase, + "afterFrameset": AfterFramesetPhase, + "afterAfterBody": AfterAfterBodyPhase, + "afterAfterFrameset": AfterAfterFramesetPhase, + # XXX after after frameset +} def adjust_attributes(token, replacements): diff --git a/html5lib/tests/test_parser2.py b/html5lib/tests/test_parser2.py index 879d2447..6b464bea 100644 --- a/html5lib/tests/test_parser2.py +++ b/html5lib/tests/test_parser2.py @@ -68,7 +68,6 @@ def test_debug_log(): ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'p', 'type': 'StartTag'}), ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}), ('dataState', 'InBodyPhase', 'InBodyPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}), - ('dataState', 'InBodyPhase', 'InHeadPhase', 'processStartTag', {'name': 'script', 'type': 'StartTag'}), ('scriptDataState', 'TextPhase', 'TextPhase', 'processCharacters', {'type': 'Characters'}), ('dataState', 'TextPhase', 'TextPhase', 'processEndTag', {'name': 'script', 'type': 'EndTag'}), ('dataState', 'InBodyPhase', 'InBodyPhase', 'processCharacters', {'type': 'Characters'}),