Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make parsing of text be non-quadratic. #579

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
81 changes: 52 additions & 29 deletions html5lib/treebuilders/etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,23 @@
tag_regexp = re.compile("{([^}]*)}(.*)")


class TextBuffer(object):
    """Accumulate text fragments and join them only when the value is read.

    Appending to a list and joining on demand avoids the repeated copying
    that ``text += data`` performs on an ever-growing string.

    The buffer is meant to behave like the string it stands in for where it
    matters to callers: it compares equal to the equivalent string (or to
    another buffer holding the same text), and an empty buffer is falsy, so
    ``if buf:`` / ``if not buf:`` checks keep their string semantics.
    """

    def __init__(self, initial=""):
        # Stored as a list of fragments; always starts with one entry.
        self.chunks = [initial]

    def __repr__(self):
        return "%s(%r)" % (type(self).__name__, self.getvalue())

    def __str__(self):
        return self.getvalue()

    def getvalue(self):
        """Return all appended fragments joined into a single string."""
        return "".join(self.chunks)

    def append(self, other):
        """Add the string *other* to the end of the buffer."""
        self.chunks.append(other)

    def __bool__(self):
        # True only if some fragment is non-empty; no join is needed to
        # answer this.
        return any(self.chunks)

    # Python 2 spells the truthiness hook ``__nonzero__``.
    __nonzero__ = __bool__

    def __eq__(self, other):
        # Unwrap another buffer so both sides are compared as plain strings.
        if isinstance(other, TextBuffer):
            other = other.getvalue()
        return self.getvalue() == other

    def __ne__(self, other):
        # Python 2 does not derive ``!=`` from ``__eq__``.
        return not self == other

    # Mutable and compared by value, so instances must not be hashable.
    __hash__ = None


def getETreeBuilder(ElementTreeImplementation, fullTree=False):
ElementTree = ElementTreeImplementation
ElementTreeCommentType = ElementTree.Comment("asd").tag
Expand Down Expand Up @@ -110,25 +127,25 @@ def removeChild(self, node):
def insertText(self, data, insertBefore=None):
if not len(self._element):
if not self._element.text:
self._element.text = ""
self._element.text += data
self._element.text = TextBuffer("")
self._element.text.append(data)
elif insertBefore is None:
# Insert the text as the tail of the last child element
if not self._element[-1].tail:
self._element[-1].tail = ""
self._element[-1].tail += data
self._element[-1].tail = TextBuffer("")
self._element[-1].tail.append(data)
else:
# Insert the text before the specified node
children = list(self._element)
index = children.index(insertBefore._element)
if index > 0:
if not self._element[index - 1].tail:
self._element[index - 1].tail = ""
self._element[index - 1].tail += data
self._element[index - 1].tail = TextBuffer("")
self._element[index - 1].tail.append(data)
else:
if not self._element.text:
self._element.text = ""
self._element.text += data
self._element.text = TextBuffer("")
self._element.text.append(data)

def cloneNode(self):
element = type(self)(self.name, self.namespace)
Expand All @@ -138,36 +155,39 @@ def cloneNode(self):

def reparentChildren(self, newParent):
if newParent.childNodes:
newParent.childNodes[-1]._element.tail += self._element.text
newParent.childNodes[-1]._element.tail.append(
self._element.text.getvalue()
)
else:
if not newParent._element.text:
newParent._element.text = ""
newParent._element.text = TextBuffer("")
if self._element.text is not None:
newParent._element.text += self._element.text
self._element.text = ""
newParent._element.text.append(self._element.text.getvalue())
self._element.text = TextBuffer("")
base.Node.reparentChildren(self, newParent)

class Comment(Element):
def __init__(self, data):
# Use the superclass constructor to set all properties on the
# wrapper element
self._element = ElementTree.Comment(data)
self._element.text = TextBuffer(data)
self.parent = None
self._childNodes = []
self._flags = []

def _getData(self):
return self._element.text
return self._element.text.getvalue()

def _setData(self, value):
self._element.text = value
self._element.text = TextBuffer(value)

data = property(_getData, _setData)

class DocumentType(Element):
def __init__(self, name, publicId, systemId):
Element.__init__(self, "<!DOCTYPE>")
self._element.text = name
self._element.text = TextBuffer(name)
self.publicId = publicId
self.systemId = systemId

Expand Down Expand Up @@ -208,19 +228,19 @@ def serializeElement(element, indent=0):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s "%s" "%s">""" %
(element.text, publicId, systemId))
(element.text.getvalue(), publicId, systemId))
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),))
elif element.tag == "DOCUMENT_ROOT":
rv.append("#document")
if element.text is not None:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue()))
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
raise TypeError("Document node cannot have attributes")
elif element.tag == ElementTreeCommentType:
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text))
rv.append("|%s<!-- %s -->" % (' ' * indent, element.text.getvalue()))
else:
assert isinstance(element.tag, text_type), \
"Expected unicode, got %s, %s" % (type(element.tag), element.tag)
Expand Down Expand Up @@ -248,13 +268,14 @@ def serializeElement(element, indent=0):

for name, value in sorted(attributes):
rv.append('|%s%s="%s"' % (' ' * (indent + 2), name, value))
if element.text:
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text))
if element.text and element.text.getvalue():
rv.append("|%s\"%s\"" % (' ' * (indent + 2), element.text.getvalue()))
indent += 2
for child in element:
serializeElement(child, indent)
if element.tail:
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail))
rv.append("|%s\"%s\"" % (' ' * (indent - 2), element.tail.getvalue()))

serializeElement(element, 0)

return "\n".join(rv)
Expand All @@ -272,13 +293,15 @@ def serializeElement(element):
if element.get("publicId") or element.get("systemId"):
publicId = element.get("publicId") or ""
systemId = element.get("systemId") or ""
rv.append("""<!DOCTYPE %s PUBLIC "%s" "%s">""" %
(element.text, publicId, systemId))
rv.append(
"""<!DOCTYPE %s PUBLIC "%s" "%s">"""
% (element.text.getvalue(), publicId, systemId)
)
else:
rv.append("<!DOCTYPE %s>" % (element.text,))
rv.append("<!DOCTYPE %s>" % (element.text.getvalue(),))
elif element.tag == "DOCUMENT_ROOT":
if element.text is not None:
rv.append(element.text)
rv.append(element.text.getvalue())
if element.tail is not None:
raise TypeError("Document node cannot have tail")
if hasattr(element, "attrib") and len(element.attrib):
Expand All @@ -288,7 +311,7 @@ def serializeElement(element):
serializeElement(child)

elif element.tag == ElementTreeCommentType:
rv.append("<!--%s-->" % (element.text,))
rv.append("<!--%s-->" % (element.text.getvalue(),))
else:
# This is assumed to be an ordinary element
if not element.attrib:
Expand All @@ -299,15 +322,15 @@ def serializeElement(element):
for name, value in element.attrib.items()])
rv.append("<%s %s>" % (element.tag, attr))
if element.text:
rv.append(element.text)
rv.append(element.text.getvalue())

for child in element:
serializeElement(child)

rv.append("</%s>" % (element.tag,))

if element.tail:
rv.append(element.tail)
rv.append(element.tail.getvalue())

serializeElement(element)

Expand Down
12 changes: 8 additions & 4 deletions html5lib/treewalkers/etree.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ def getNodeDetails(self, node):
if isinstance(node, tuple): # It might be the root Element
elt, _, _, flag = node
if flag in ("text", "tail"):
return base.TEXT, getattr(elt, flag)
return base.TEXT, getattr(elt, flag).getvalue()
else:
node = elt

Expand All @@ -44,11 +44,15 @@ def getNodeDetails(self, node):
return (base.DOCUMENT,)

elif node.tag == "<!DOCTYPE>":
return (base.DOCTYPE, node.text,
node.get("publicId"), node.get("systemId"))
return (
base.DOCTYPE,
node.text.getvalue(),
node.get("publicId"),
node.get("systemId"),
)

elif node.tag == ElementTreeCommentType:
return base.COMMENT, node.text
return base.COMMENT, node.text.getvalue()

else:
assert isinstance(node.tag, string_types), type(node.tag)
Expand Down
Loading