class TextBuffer(object):
    """Mutable text accumulator used in place of a plain string.

    Fragments are collected in ``self.chunks`` and only joined when the
    value is requested, so each ``append`` is a list append rather than a
    string concatenation.

    The object mimics the parts of the string protocol its users rely on:
    it is falsy when it holds no characters, supports ``len()``, converts
    with ``str()``, and compares equal to a string (or another
    ``TextBuffer``) with the same content.
    """

    # Instances are mutable and compare by value, so they must not be
    # usable as dict keys or set members.
    __hash__ = None

    def __init__(self, initial=""):
        """Create a buffer whose first fragment is *initial*."""
        self.chunks = [initial]

    def __str__(self):
        """Return the accumulated text."""
        return self.getvalue()

    def __repr__(self):
        return "%s(%r)" % (type(self).__name__, self.getvalue())

    def __len__(self):
        """Return the number of characters held, without joining."""
        return sum(len(chunk) for chunk in self.chunks)

    def __bool__(self):
        """Return True only if at least one fragment is non-empty.

        Callers guard with ``if not element.text`` / ``if element.tail``;
        an empty buffer therefore has to be falsy, exactly like the empty
        string it stands in for.
        """
        return any(self.chunks)

    # Python 2 spells the truthiness hook differently.
    __nonzero__ = __bool__

    def getvalue(self):
        """Return all fragments joined into a single string."""
        return "".join(self.chunks)

    def append(self, other):
        """Add the string *other* to the end of the buffer."""
        self.chunks.append(other)

    def __eq__(self, other):
        """Compare by content against a string or another ``TextBuffer``."""
        if isinstance(other, TextBuffer):
            other = other.getvalue()
        return self.getvalue() == other

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self == other
newParent.childNodes[-1]._element.tail.append( + self._element.text.getvalue() + ) else: if not newParent._element.text: - newParent._element.text = "" + newParent._element.text = TextBuffer("") if self._element.text is not None: - newParent._element.text += self._element.text - self._element.text = "" + newParent._element.text.append(self._element.text.getvalue()) + self._element.text = TextBuffer("") base.Node.reparentChildren(self, newParent) class Comment(Element): @@ -152,22 +171,23 @@ def __init__(self, data): # Use the superclass constructor to set all properties on the # wrapper element self._element = ElementTree.Comment(data) + self._element.text = TextBuffer(data) self.parent = None self._childNodes = [] self._flags = [] def _getData(self): - return self._element.text + return self._element.text.getvalue() def _setData(self, value): - self._element.text = value + self._element.text = TextBuffer(value) data = property(_getData, _setData) class DocumentType(Element): def __init__(self, name, publicId, systemId): Element.__init__(self, "<!DOCTYPE>") - self._element.text = name + self._element.text = TextBuffer(name) self.publicId = publicId self.systemId = systemId @@ -208,19 +228,19 @@ def serializeElement(element, indent=0): publicId = element.get("publicId") or "" systemId = element.get("systemId") or "" rv.append("""<!DOCTYPE %s "%s" "%s">""" % - (element.text, publicId, systemId)) + (element.text.getvalue(), publicId, systemId)) else: - rv.append("<!DOCTYPE %s>" % (element.text,)) + rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),)) elif element.tag == "DOCUMENT_ROOT": rv.append("#document") if element.text is not None: - rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text)) + rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue())) if element.tail is not None: raise TypeError("Document node cannot have tail") if hasattr(element, "attrib") and len(element.attrib): raise TypeError("Document node cannot have attributes") elif 
element.tag == ElementTreeCommentType: - rv.append("|%s<!-- %s -->" % (' ' * indent, element.text)) + rv.append("|%s<!-- %s -->" % (' ' * indent, element.text.getvalue())) else: assert isinstance(element.tag, text_type), \ "Expected unicode, got %s, %s" % (type(element.tag), element.tag) @@ -248,13 +268,14 @@ def serializeElement(element, indent=0): for name, value in sorted(attributes): rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value)) - if element.text: - rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text)) + if element.text and element.text.getvalue(): + rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue())) indent += 2 for child in element: serializeElement(child, indent) if element.tail: - rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail)) + rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail.getvalue())) + serializeElement(element, 0) return "\n".join(rv) @@ -272,13 +293,15 @@ def serializeElement(element): if element.get("publicId") or element.get("systemId"): publicId = element.get("publicId") or "" systemId = element.get("systemId") or "" - rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" % - (element.text, publicId, systemId)) + rv.append( + """<!DOCTYPE %s PUBLIC "%s" "%s">""" + % (element.text.getvalue(), publicId, systemId) + ) else: - rv.append("<!DOCTYPE %s>" % (element.text,)) + rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),)) elif element.tag == "DOCUMENT_ROOT": if element.text is not None: - rv.append(element.text) + rv.append(element.text.getvalue()) if element.tail is not None: raise TypeError("Document node cannot have tail") if hasattr(element, "attrib") and len(element.attrib): @@ -288,7 +311,7 @@ def serializeElement(element): serializeElement(child) elif element.tag == ElementTreeCommentType: - rv.append("<!--%s-->" % (element.text,)) + rv.append("<!--%s-->" % (element.text.getvalue(),)) else: # This is assumed to be an ordinary element if not element.attrib: @@ -299,7 +322,7 @@ def 
serializeElement(element): for name, value in element.attrib.items()]) rv.append("<%s %s>" % (element.tag, attr)) if element.text: - rv.append(element.text) + rv.append(element.text.getvalue()) for child in element: serializeElement(child) @@ -307,7 +330,7 @@ def serializeElement(element): rv.append("</%s>" % (element.tag,)) if element.tail: - rv.append(element.tail) + rv.append(element.tail.getvalue()) serializeElement(element) diff --git a/html5lib/treewalkers/etree.py b/html5lib/treewalkers/etree.py index 411a1d45..47c8577e 100644 --- a/html5lib/treewalkers/etree.py +++ b/html5lib/treewalkers/etree.py @@ -33,7 +33,7 @@ def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element elt, _, _, flag = node if flag in ("text", "tail"): - return base.TEXT, getattr(elt, flag) + return base.TEXT, getattr(elt, flag).getvalue() else: node = elt @@ -44,11 +44,15 @@ def getNodeDetails(self, node): return (base.DOCUMENT,) elif node.tag == "<!DOCTYPE>": - return (base.DOCTYPE, node.text, - node.get("publicId"), node.get("systemId")) + return ( + base.DOCTYPE, + node.text.getvalue(), + node.get("publicId"), + node.get("systemId"), + ) elif node.tag == ElementTreeCommentType: - return base.COMMENT, node.text + return base.COMMENT, node.text.getvalue() else: assert isinstance(node.tag, string_types), type(node.tag)